Polysubstance and tr. completion
Analyze the association between polysubstance use at admission and treatment completion longitudinally across treatments, accounting for irregular observation times.
Data Loading and Exploration
Loading Packages and uniting databases
Proceed to load the necessary packages.
Code
# invisible("Only run from Ubuntu")
# if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
# if(Sys.info()["sysname"]!="Windows"){
# Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
# }
# }
#clean enviroment
rm(list = ls()); gc()
time_before_dedup2<-Sys.time()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- Bootstrap reticulate con ruta relativa a getwd() ---
suppressPackageStartupMessages(library(reticulate))
# Busca .mamba_root/envs/py311/python.exe desde getwd() hacia padres
# Locate a Python interpreter at `rel` by walking upward from `start`
# through its parent directories.
#
# start: directory to begin the search from (defaults to getwd()).
# rel:   relative path of the interpreter to look for.
# Returns the normalized path of the first match, or NA_character_ when
# the filesystem root is reached without finding one.
find_python_rel <- function(start = getwd(),
                            rel = file.path(".mamba_root", "envs", "py311", "python.exe")) {
  dir <- normalizePath(start, winslash = "/", mustWork = FALSE)
  while (TRUE) {
    candidate <- normalizePath(file.path(dir, rel), winslash = "/", mustWork = FALSE)
    if (file.exists(candidate)) {
      return(candidate)
    }
    parent <- dirname(dir)
    if (identical(parent, dir)) {
      return(NA_character_)  # dirname() is a fixed point only at the root
    }
    dir <- parent
  }
}
# Resolve the interpreter relative to getwd() and fail fast when absent.
py <- find_python_rel()
if (is.na(py)) {
  stop("No se encontró Python relativo a getwd() (buscando '.mamba_root/envs/py311/python.exe').\n",
       "Directorio actual: ", getwd())
}
# Clear settings that could override the interpreter choice, then force it.
Sys.unsetenv(c("RETICULATE_CONDAENV","RETICULATE_PYTHON_FALLBACK"))
Sys.setenv(RETICULATE_PYTHON = py)
reticulate::use_python(py, required = TRUE)  # fixed: TRUE, not T (T is reassignable)
py_config() # verification
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Project root with a trailing slash: appending "/cons" and then deleting
# every "cons" strips a trailing ".../cons" working directory if present.
# NOTE(review): gsub("cons", "", ...) also mangles any OTHER occurrence of
# "cons" inside the path -- confirm the project path never contains that
# substring elsewhere.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
# Best-effort loads wrapped in try() so a missing file does not abort the render.
try(load(paste0(wdpath,"data/20241015_out/psu/","SISTRAT_database_sep25_prevt.RData")))
SISTRAT23_c1_2010_2024_df_prev1w<-
try(readRDS(paste0(wdpath,"data/20241015_out/psu/","SISTRAT23_c1_2010_2024_df_prev1w.rds")))
df23_ndp_20250824_SISTRAT23_c1_1024<-
readRDS(paste0(wdpath,"data/20241015_out/psu/","23_ndp_20250824_SISTRAT23_c1_1024_df2.rds")) used (Mb) gc trigger (Mb) max used (Mb)
Ncells 601815 32.2 1294412 69.2 965217 51.6
Vcells 1156319 8.9 8388608 64.0 1876154 14.4
python: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe
libpython: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python311.dll
pythonhome: G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311
version: 3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]
Architecture: 64bit
numpy: [NOT FOUND]
NOTE: Python version was forced by RETICULATE_PYTHON
Code
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)
#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")
#check if rstools is installed
try(installr::install.Rtools(check_r_update=F))Code
# Verify that the installed Quarto CLI satisfies a version constraint.
#
# required:   minimum/target version as a string, e.g. "1.7.29".
# comparator: one of "ge", "gt", "le", "lt", "eq" (default "ge").
# Stops with an informative error when the constraint fails; otherwise
# returns TRUE invisibly.
check_quarto_version <- function(required = "1.7.29",
                                 comparator = c("ge", "gt", "le", "lt", "eq")) {
  comparator <- match.arg(comparator)
  current <- package_version(paste(unlist(quarto::quarto_version()), collapse = "."))
  target <- package_version(required)
  passed <- switch(
    comparator,
    ge = current >= target,
    gt = current > target,
    le = current <= target,
    lt = current < target,
    eq = current == target
  )
  if (!passed) {
    stop(sprintf("Quarto version check failed: need %s %s (installed: %s).",
                 comparator, required, current), call. = FALSE)
  }
  invisible(TRUE)
}
check_quarto_version("1.7.29", "ge")
#change repository to CL
local({
r <- getOption("repos")
r["CRAN"] <- "https://cran.dcc.uchile.cl/"
options(repos=r)
})
if(!require(pacman)){install.packages("pacman");require(pacman)}Code
if(!require(pak)){install.packages("pak");require(pak)}Code
pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetesCode
if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}
#check docker
check_docker_running <- function() {
  # Probe the Docker daemon by running `docker info`, capturing stdout
  # (intern = TRUE) and discarding stderr. If the docker binary cannot be
  # executed at all, system() signals an error; if the binary runs but the
  # daemon is down, the captured output carries a non-zero "status"
  # attribute and a warning is emitted instead (see the rendered warning
  # further down in this document).
  system("docker info", intern = TRUE, ignore.stderr = TRUE)
}
install_docker <- function() {
  # Side effect only: open the Docker Desktop download page in the default
  # browser so the user can install Docker manually. Returns nothing useful.
  browseURL("https://www.docker.com/products/docker-desktop")
}
# Main logic: probe docker; when the probe errors (docker binary not
# executable), open the Docker Desktop download page.
if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
  # Fixed: was liftr::install_docker(); the intended call is the local
  # install_docker() helper defined just above.
  install_docker()
} else {
  message("Docker is running.")
}
Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
unlink("*_cache", recursive=T)
# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------
paks <-
c(#"git",
# To connect to github
"gh", #interface for GitHub API from R
#
"gitcreds", # manages Git credentials (usernames, passwords, tokens)
#
"usethis", # simplifies common project setup tasks for R developers
# Package to bring packages in development
"devtools",
# Package administration
"renv",
# To manipulate data
"knitr", "pander", "DT",
# Join
"fuzzyjoin", "RecordLinkage",
# For tables
"tidyverse", "janitor",
# For contingency tables
"kableExtra",
# For connections with python
"reticulate",
# To manipulate big data
"polars", "sqldf",
# To bring big databases
"nanoparquet",
# Interface for R and RStudio in R
"installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
# Time handling
"clock",
# Combine plots
"ggpubr",
# Parallelized iterative processing
"furrr",
# Work like a tibble with a data.table database
"tidytable",
# Split database into training and testing
"caret",
# Impute missing data
"missRanger", "mice",
# To modularize tasks
"job",
# For PhantomJS install checks
"webshot"
)
# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf
# pak::pkg_install(paks)
pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)), "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()
library(tidytable)Code
library(ggplot2)
library(readr)
library(survival)
cat("Install IrregLong\n")
if (!requireNamespace("IrregLong", quietly = TRUE)) {
install.packages("IrregLong")
}
cat("Install geepack\n")
if (!requireNamespace("geepack", quietly = TRUE)) {
install.packages("geepack")
}
# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
#try(polars_code_completion_activate())
# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
devtools::install_github("bergant/bpmn")
}
# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
# if (!webshot::is_phantomjs_installed()) {
# webshot::install_phantomjs()
# }
# Memory management function
# Trigger garbage collection and, on legacy Windows R only, try to raise
# the process memory cap. memory.limit() was removed in R 4.2.0 -- on
# newer R it warns and returns Inf (the rendered warning below shows
# exactly that) -- so the call is now gated on the running R version.
manage_memory <- function() {
  gc()
  if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
    tryCatch({
      memory.limit(size = memory.limit() + 1000)
    }, error = function(e) NULL)
  }
}
manage_memory()Warning: ‘memory.limit()’ is no longer supported
Code
# Report memory pressure on legacy Windows R; returns TRUE when usage
# exceeds 80% of the limit, FALSE otherwise (and always FALSE elsewhere).
# memory.size()/memory.limit() were removed in R 4.2.0 and now return Inf
# with a warning, which made mem/limit evaluate to NaN -- hence the added
# version gate.
check_memory <- function() {
  if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
    mem <- memory.size()
    limit <- memory.limit()
    # NOTE(review): log_message() is not defined anywhere in this file --
    # assumed to come from elsewhere in the project; confirm before relying
    # on this branch.
    log_message(paste("Memory usage:", round(mem, 1), "MB /", limit, "MB"))
    return(mem/limit > 0.8)
  }
  return(FALSE)
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
# NO MORE DEBUGS
options(error = NULL) # si antes tenías options(error = recover) o browser)
options(browserNLdisabled = FALSE)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')
pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
#to format rows in bold
# Wrap the selected cells of `df` in pandoc emphasis markup.
#
# df:    data.frame whose cells are (coerced to) character.
# rows:  row indices to format.
# cols:  column indices/names to format.
# value: one of "italics" (*), "bold" (**), "strikethrough" (~~).
# Returns the modified data.frame.
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){
  # Fixed: match.arg() validates `value` and resolves the default to a
  # single choice; previously map[value] with the unevaluated default
  # produced a length-3 markup vector that relied on silent truncation.
  value <- match.arg(value)
  # select the correct markup: one * for italics, two ** for bold
  map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- map[value]
  for (r in rows){
    for(c in cols){
      # Make sure values are not factors
      df[[c]] <- as.character( df[[c]])
      # Empty cells stay empty; otherwise strip internal spaces and wrap
      # the remaining text in the markup markers.
      df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
    }
  }
  return(df)
}
#To produce line breaks in messages and warnings
knitr::knit_hooks$set(
# Render chunk errors as Bootstrap "alert-danger" boxes: the leading
# "## Error" marker becomes bold and remaining "##" prefixes become
# line breaks.
error = function(x, options) {
paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
'</div>', sep = '\n')
},
# Same treatment for warnings, using the "alert-warning" style.
warning = function(x, options) {
paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
'</div>', sep = '\n')
},
# Messages get a plain "message" div with "##" prefixes turned into breaks.
message = function(x, options) {
paste('<div class="message" style="font-size: small !important;">',
gsub('##', '\n', x),
'</div>', sep = '\n')
}
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#
# Summarise a vector of dates: minimum, a fixed set of percentiles, and
# maximum, returned as a one-row data.frame of Date columns
# (min, p001, ..., p999, max). NA dates are ignored.
sum_dates <- function(x){
  days <- unclass(as.Date(x))
  probs <- c(p001 = .001, p005 = .005, p025 = .025, p25 = .25, p50 = .5,
             p75 = .75, p975 = .975, p995 = .995, p999 = .999)
  vals <- c(
    min = min(days, na.rm = TRUE),
    vapply(probs, function(p) unname(quantile(days, p, na.rm = TRUE)), numeric(1)),
    max = max(days, na.rm = TRUE)
  )
  # Convert each numeric day count back to Date and bind into one row.
  do.call(cbind.data.frame, lapply(vals, as.Date, origin = "1970-01-01"))
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Define the function adapted for Polars
sum_dates_polars <- function(df, date_col) {
# Create the list of quantiles
quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
# Create expressions to calculate min and max
expr_list <- list(
pl$col(date_col)$min()$alias("min"),
pl$col(date_col)$max()$alias("max")
)
# Add expressions for quantiles
for (q in quantiles) {
expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
}
# Apply the expressions and return a DataFrame with the results
df$select(expr_list)
}
# Custom function for sampling with a seed
sample_n_with_seed <- function(data, size, seed) {
  # Reproducible row sampling: fixes the global RNG seed (a side effect on
  # .Random.seed) immediately before delegating to dplyr::sample_n().
  # NOTE(review): sample_n() is superseded by dplyr::slice_sample() -- kept
  # here to preserve existing behavior.
  set.seed(seed)
  dplyr::sample_n(data, size)
}
# Function to get the most frequent value
most_frequent <- function(x) {
uniq_vals <- unique(x)
freq_vals <- sapply(uniq_vals, function(val) sum(x == val))
most_freq <- uniq_vals[which(freq_vals == max(freq_vals))]
if (length(most_freq) == 1) {
return(most_freq)
} else {
return(NA)
}
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
options(scipen=2) #display numbers rather scientific number
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Define the function first
#joins these values with semicolons and optionally truncates the result if it exceeds a specified width.
# Join the elements of `x` with "; " and optionally truncate the result
# to at most `width` display characters, appending "..." when truncated.
# width = NULL or 0 disables truncation; negative widths are an error;
# widths below 6 are raised to 6 before truncating.
toString2 <- function(x, width = NULL, ...) {
  out <- paste(x, collapse = "; ")
  if (missing(width) || is.null(width) || width == 0) {
    return(out)
  }
  if (width < 0) {
    stop("'width' must be positive")
  }
  if (nchar(out, type = "w") > width) {
    w <- max(6, width)
    out <- paste0(substr(out, 1, w - 3), "...")
  }
  out
}
# Normalise free text for matching: transliterate accented characters to
# plain ASCII, lowercase, and trim surrounding whitespace.
normalize_txt <- function(x) {
  trimws(tolower(stringi::stri_trans_general(x, "Latin-ASCII")))
}
icd10_broad <- function(x) {
# Map free-text (Spanish) psychiatric diagnosis strings to broad ICD-10
# chapter labels (F0-F9). Patterns are evaluated in order and the FIRST
# match wins, so reordering these branches changes the classification.
x <- tolower(x)
tidytable::case_when(
stringr::str_detect(x, "organic|organico|demenc|alzheimer|parkinson|delirium|cerebral") ~ "F0 Organic",
stringr::str_detect(x, "psicoactiva|alcohol|marihuana|canabis|cannabis|cocain|opio|opiace|benzodiazep|sustancias") ~ "F1 Substance use",
stringr::str_detect(x, "esquizofren|psicotip|delirant|psicosis") ~ "F2 Psychotic",
stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "F3 Mood",
stringr::str_detect(x, "neurotic|ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci|somatoform|somatiz") ~ "F4 Anxiety/Stress/Somatoform",
stringr::str_detect(x, "comportamiento.*fisiolog|alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual") ~ "F5 Physio/Eating/Sleep/Sexual",
# This branch re-applies case-insensitivity explicitly via regex();
# the others rely on the tolower() above.
stringr::str_detect(x, stringr::regex("personalidad|comportamiento del adulto|antisocial|limite|evitativ|habit|habitos|impuls|control de los impulsos|control\\s+de\\s+impulsos", ignore_case = TRUE)) ~ "F6 Personality/Adult behaviour",
stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "F7 Intellectual disability",
stringr::str_detect(x, "desarrollo psicolog|autism|asperger|lenguaje|aprendizaje|espectro autista|tdah|t\\s*d\\s*a\\s*h") ~ "F8/9 Neurodevelopment/Child",
# Fallback for anything unmatched (including NA input, which case_when
# propagates through str_detect as non-matching).
TRUE ~ "Other/unspecified"
)
}
# Map free-text (Spanish) DSM-IV diagnosis strings to broad categories.
# Patterns are evaluated in order; the FIRST match wins.
dsmiv_broad <- function(x) {
  x <- tolower(x)
  tidytable::case_when(
    stringr::str_detect(x, "sustancia|alcohol|marihuana|cannabis|cocain|opio|opiace|benzodiazep") ~ "Substance-related",
    stringr::str_detect(x, "esquizofren|psicosis|psicot") ~ "Psychotic",
    stringr::str_detect(x, "estado de animo|afectiv|depres|bipolar|maniaco|distimia|hipoman") ~ "Mood",
    stringr::str_detect(x, "ansiedad|fobi|panico|obsesivo|compulsiv|estres|adaptaci") ~ "Anxiety/Stress/Adjustment",
    stringr::str_detect(x, "somatoform|somatiz|disociativ|conversion") ~ "Somatoform/Dissociative",
    # Fixed: "para[f|f]ilia" was a character class matching "f" or "|";
    # the intended literal pattern is "parafilia".
    stringr::str_detect(x, "alimentari|anorex|bulim|sueñ|insomni|disfuncion sexual|parafilia|sexual") ~ "Eating/Sleep/Sexual",
    stringr::str_detect(x, "personalidad|antisocial|limite|borderline|paranoide|evitativ|dependient|narcis") ~ "Personality",
    stringr::str_detect(x, "retraso mental|discapacidad intelectual|intelectual") ~ "Intellectual disability",
    stringr::str_detect(x, "desarrollo|autism|asperger|infancia|tdah|t\\s*d\\s*a\\s*h|lenguaje|aprendizaje") ~ "Neurodevelopment/Childhood-onset",
    TRUE ~ "Other/unspecified"
  )
}
# Combine two code vectors elementwise as "main::sub". A missing side is
# printed as the literal string "NA", but when BOTH sides are missing the
# result is a true NA.
pair_str <- function(main, sub) {
  main <- as.character(main)
  sub <- as.character(sub)
  both_na <- is.na(main) & is.na(sub)
  lhs <- ifelse(is.na(main), "NA", main)
  rhs <- ifelse(is.na(sub), "NA", sub)
  combined <- paste0(lhs, "::", rhs)
  combined[both_na] <- NA_character_
  combined
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# IIW weight functions
# Effective sample size of a weight vector: n / (1 + CV^2), where CV is
# the coefficient of variation of the non-NA weights (sample variance).
ess <- function(w) {
  obs <- w[!is.na(w)]
  cv_sq <- var(obs) / mean(obs)^2
  length(obs) / (1 + cv_sq)
}
# One-row tibble of diagnostics for a vector of IIW weights: counts,
# location/spread summaries, tail proportions, and effective sample size.
#
# w:    numeric weight vector (NAs tolerated).
# name: label for the weight set; currently unused because the
#       weight_set column is disabled -- kept for interface compatibility.
# lo/hi: tail quantile probabilities. Fixed: these parameters were
#       declared but ignored (quantiles hard-coded at 0.01/0.99); the
#       defaults reproduce the previous output exactly.
summ_w <- function(w, name, lo = 0.01, hi = 0.99) {
  tibble::tibble(
    #weight_set = name,
    n_non_na = sum(!is.na(w)),
    n_na = sum(is.na(w)),
    min = min(w, na.rm=TRUE),
    q01 = stats::quantile(w, probs=lo, na.rm=TRUE),
    q25 = stats::quantile(w, probs=0.25, na.rm=TRUE),
    median = stats::median(w, na.rm=TRUE),
    mean = mean(w, na.rm=TRUE),
    q75 = stats::quantile(w, probs=0.75, na.rm=TRUE),
    q99 = stats::quantile(w, probs=hi, na.rm=TRUE),
    max = max(w, na.rm=TRUE),
    # Flags for extreme weights that would dominate or vanish in a
    # weighted analysis.
    prop_lt_0.2= mean(w < 0.2, na.rm=TRUE),
    prop_gt_5 = mean(w > 5, na.rm=TRUE),
    ESS = ess(w)
  )
}
# truncated versions (1st–99th) for sensitivity
cap <- function(w, p = c(0.01, 0.99)) {
b <- stats::quantile(w, probs=p, na.rm=TRUE)
pmin(pmax(w, b[1]), b[2])
}
# Rescale weights so they average to 1 (NAs ignored when computing the mean).
renorm <- function(w) {
  w / mean(w, na.rm = TRUE)
}
#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_
counting_process_output<-
function(object=NULL){
# Tidy the Cox model stored in object$m: exponentiated estimates with
# confidence intervals, then formatted as character columns (2 decimals
# for estimates/SEs/statistics/CIs, 4 for the p-value).
broom::tidy(object$m, exponentiate=T, conf.int=T) %>%
dplyr::mutate(across(c("estimate","std.error", "statistic","conf.low","conf.high"),~sprintf("%1.2f",.))) %>%
dplyr::mutate(p.value= sprintf("%1.4f",p.value))
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Tidy a coxph-style fit into a data.frame of hazard ratios with Wald
# statistics and confidence intervals based on the model's variance matrix
# (robust if the fit was made with robust/cluster variance).
#
# model:      object exposing $coefficients (named numeric vector) and
#             $var (variance-covariance matrix).
# conf.level: two-sided confidence level (default 0.95).
# Returns a data.frame with columns term, coef, robust_se, hazard_ratio,
# lower_ci, upper_ci, z, p_value.
tidy_coxph <- function(model, conf.level = 0.95) {
  coefs <- model$coefficients
  robust_se <- sqrt(diag(model$var))
  z <- coefs / robust_se
  # Fixed: 2 * (1 - pnorm(|z|)) underflows to exactly 0 for |z| > ~8;
  # evaluating the upper tail directly preserves precision for tiny p-values.
  p_value <- 2 * pnorm(abs(z), lower.tail = FALSE)
  # Critical z for a two-sided interval at the requested level.
  z_crit <- qnorm(1 - (1 - conf.level) / 2)
  hazard_ratio <- exp(coefs)
  lower_ci <- exp(coefs - z_crit * robust_se)
  upper_ci <- exp(coefs + z_crit * robust_se)
  data.frame(
    term = names(coefs),
    coef = coefs,
    robust_se = robust_se,
    hazard_ratio = hazard_ratio,
    lower_ci = lower_ci,
    upper_ci = upper_ci,
    z = z,
    p_value = p_value,
    stringsAsFactors = FALSE
  )
}
# Append an "HR (low, high)" display column to a tidy_coxph()-style table.
#
# tidy_table: data.frame with hazard_ratio, lower_ci and upper_ci columns.
# digits:     decimal places for each number (default 2).
# Returns the table with an added character column hr_ci.
format_hr_ci <- function(tidy_table, digits = 2) {
  # Build e.g. "%.2f (%.2f, %.2f)" -- %% escapes the literal percent sign.
  fmt <- sprintf("%%.%df (%%.%df, %%.%df)", digits, digits, digits)
  tidy_table$hr_ci <- sprintf(fmt,
                              tidy_table$hazard_ratio,
                              tidy_table$lower_ci,
                              tidy_table$upper_ci)
  tidy_table
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:Error in contrib.url(repos, "source") :
trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr 3.7.6
- cli 3.6.2
- curl 5.2.1
- desc 1.4.3
- filelock 1.0.3
- jsonlite 1.8.8
- lpSolve 5.6.23.9000
- pkgbuild 1.4.4
- pkgcache 2.2.2.9000
- pkgdepends 0.7.2.9000
- pkgsearch 3.1.3.9000
- processx 3.8.4
- ps 1.7.6
- R6 2.5.1
- zip 2.3.1
* Dependencies can be loaded
Install IrregLong
Install geepack
[1] Inf
Database consolidation
We used SISTRAT23_c1_2010_2024_df_prev1w instead of v16 because we wanted to capture more treatments from each person.
Code
invisible("Label variables")
# main=========
# =========================
# Core variables
# =========================
# Biopsychosocial compromise = biopsych_comp / severe = biopsych_comp_severe #created
# Age at Admission to Treatment (First Entry) = adm_age_rec3
# Birth year = birth_date_rec / year_decimal #created
# Primary substance of the initial diagnosis = sus_ini_mod_mvv
# Psychiatric comorbidity (ICD-10) = dg_psiq_cie_10_dg
# Psychiatric comorbidity (ICD-10) [In study] = dg_psiq_cie_10_instudy
# Broad diagnostic categories (e.g., Anxiety, Personality) = cie10_broad_cat
# Frequency of primary substance use at admission = prim_sub_freq
# Daily = created= prim_sub_daily
# Occupational status (corrected) = occupation_condition_corr24 # created
# Primary substance at admission to treatment = primary_sub
# Polysubstance = polysubstance_strict # created
# =========================
# Survival variables
# =========================
# Discharge date (numeric, rec6) = disch_date_num_rec6
# Discharge date (numeric, rec6, imputed/translated) = disch_date_num_rec6_trans
# Admission date of next treatment = adm_date_next_treat
# Admission date (numeric, rec2) = adm_date_num_rec2
# Treatment compliance (rec7) = tr_compliance_rec7 # created
# =========================
# Effect modifier
# =========================
# Plan type= plan_type_mod # created
# =========================
# Process variables
# =========================
# Treatment outcome of the previous treatment= tr_completion # created
# Previous biopsychosocial compromise (severe)
# Previous treatment duration (<90 days)
# Previous treatment duration (log days)
# Polysubstance use status of the previous treatment
# Age at admission to treatment (previous)
# Birth year (previous)
# Primary substance (initial diagnosis): alcohol = primary_sub
# Primary substance (initial diagnosis): cocaine
# Primary substance (initial diagnosis): cocaine base paste
# Primary substance (initial diagnosis): marijuana
# Psychiatric comorbidity (diagnosis unknown / under study)
# Psychiatric comorbidity (confirmed comorbidity)
# Daily frequency of primary substance at admission
# Occupational status (inactive) = occupation_condition_corr # created
# Occupational status (unemployed) = occupation_condition_corr # created
# Primary substance at admission to treatment (alcohol)
# Primary substance at admission to treatment (cocaine hydrochloride)
# Primary substance at admission to treatment (cocaine base paste)
# Primary substance at admission to treatment (marijuana)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# adapt database=========
sec_cols <- c("second_sub1", "second_sub2", "second_sub3")
out <- SISTRAT23_c1_2010_2024_df_prev1w %>%
left_join(SISTRAT23_c1_2010_2024_df_prev1t[,c("rn", "birth_date_rec", "dg_psiq_cie_10_instudy", "dg_psiq_cie_10_dg")], by="rn", multiple="first")%>%
arrange(hash_key, adm_age_rec3)%>%
mutate(
# Clean all substance columns
across(all_of(c("primary_sub", sec_cols)), ~ .x %>% stringr::str_squish() %>% stringr::str_to_lower()),
# Count real secondary substances (non-NA, non-empty, not "sin sustancia...")
n_real_secondary = rowSums(
across(all_of(sec_cols), ~
!is.na(.x) &
.x != "" &
!stringr::str_detect(.x, "sin\\s*sustancia\\s*principal")
),
na.rm = TRUE
),
# Define polysubstance logic:
# Case 1: primary_sub is NA → polysubstance if ≥2 real secondaries
# Case 2: primary_sub NOT NA → polysubstance if any secondary ≠ primary_sub
has_real_secondary = case_when(
is.na(primary_sub) ~ n_real_secondary >= 2,
TRUE ~ if_any(
all_of(sec_cols),
~ !is.na(.x) &
.x != "" &
!stringr::str_detect(.x, "sin\\s*sustancia\\s*principal") &
.x != primary_sub
)
),
polysubstance_strict = as.integer(has_real_secondary)
)%>%
mutate(
plan_code_std = plan_type_corr %>%
stringr::str_trim() %>% stringr::str_to_lower() %>% stringr::str_replace_all("[ _]", "-"),
plan_sponsor = case_when(
stringr::str_detect(plan_code_std, "^m-") ~ "WO",
stringr::str_detect(plan_code_std, "^pg-") ~ "GP",
TRUE ~ NA_character_
),
plan_level = case_when(
stringr::str_detect(plan_code_std, "pr$") ~ "residential",
stringr::str_detect(plan_code_std, "pai$") ~ "intensive ambulatory",
stringr::str_detect(plan_code_std, "pab$") ~ "basic ambulatory",
TRUE ~ NA_character_
),
# Optional combined label if you still want one
plan_type_mod = case_when(
!is.na(plan_sponsor) & !is.na(plan_level) ~ paste(plan_sponsor, plan_level),
!is.na(plan_level) ~ plan_level,
TRUE ~ NA_character_
),
plan_type_mod = case_when(plan_type_mod=="WO basic ambulatory"~ "WO intensive ambulatory",
TRUE ~ plan_type_mod
)
)%>%
group_by(hash_key)%>%
mutate(plan_type_mod = first(plan_type_mod))%>%
ungroup()%>%
mutate(
prim_sub_daily = case_when(
is.na(prim_sub_freq) ~ NA_integer_,
prim_sub_freq == "5. Daily" ~ 1L,
TRUE ~ 0L
)
)%>%
mutate(
tr_completion = case_when(
tr_compliance_rec7 == "completion" ~ 1L,
stringr::str_detect(tr_compliance_rec7, "current") ~ NA_integer_,
TRUE ~ 0L
),
# Non-completion (1) vs completion (0); keep NA for "currently in"
tr_noncompletion = case_when(
is.na(tr_completion) ~ NA_integer_,
tr_completion == 1L ~ 0L,
TRUE ~ 1L
)
)%>%
mutate(
primary_sub_std = stringr::str_to_lower(stringr::str_squish(primary_sub)),
primary_sub_mod = case_when(
stringr::str_detect(primary_sub_std, "cocaine\\s*(base\\s*)?paste|pasta\\s*base|\\bpaco\\b") ~ "cocaine paste",
stringr::str_detect(primary_sub_std, "cocaine\\s*(powder|hydrochloride|hcl)") |
stringr::str_detect(primary_sub_std, "clorhidrato\\s*de\\s*coca|clorhidrato|hidrocloruro") ~ "cocaine powder",
stringr::str_detect(primary_sub_std, "\\balcohol\\b") ~ "alcohol",
stringr::str_detect(primary_sub_std, "marij|cannab|marihu") ~ "marijuana",
TRUE ~ "others"
)) %>%
mutate(primary_sub_mod = factor(primary_sub_mod,
levels = c("cocaine paste","cocaine powder","alcohol","marijuana","others"))) %>%
mutate(treat_log_days = if_else(!is.na(dit_rec6), log1p(dit_rec6), NA_real_),
treat_lt_90 = if_else(!is.na(dit_rec6), as.integer(dit_rec6 < 90), NA_integer_))%>%# log(1+days)
mutate(biopsych_comp_severe = case_when(
biopsych_comp == "3-severe" ~ 1L,
biopsych_comp %in% c("1-mild", "2-moderate") ~ 0L,
TRUE ~ NA_integer_))%>% # keep NA if any missing)
mutate(dg_psiq_cie_10_instudy= if_else(dg_psiq_cie_10_dg==TRUE, FALSE, dg_psiq_cie_10_instudy))%>%
select(-any_of(c("primary_sub_std", "plan_code_std", "plan_sponsor", "plan_level", "n_real_secondary", "has_real_secondary", "tr_completion", "inconsistency_pair", "change_reason", "status_is_working", "occupation_condition_std", "occupation_status_std", "status_is_working", "ed_attainment_aux", "obs")))
# 2) Labels (these will be picked up by packages like table1/gtsummary; TableOne itself doesn’t use labels)
attr(out$tr_noncompletion, "label") <- "Non-completion status of treatment (Dropout / Misspelled)"
attr(out$biopsych_comp_severe, "label") <- "Biopsychosocial compromise (Severe)"
attr(out$treat_lt_90, "label") <- "Treatment duration (binary) (<90 days)"
attr(out$treat_log_days, "label") <- "Treatment duration (log-scaled days)"
attr(out$adm_age_rec3, "label") <- "Age at admission to treatment"
attr(out$birth_date_rec, "label") <- "Birth year"
# Primary substance (initial diagnosis) – label the variable; levels print as rows
attr(out$sus_ini_mod_mvv, "label") <- "Primary substance (initial diagnosis)"
# Psychiatric comorbidity
attr(out$dg_psiq_cie_10_instudy, "label") <- "Psychiatric comorbidity (ICD-10): In study"
attr(out$dg_psiq_cie_10_dg, "label") <- "Psychiatric comorbidity (ICD-10): Diagnosis"
# Frequency / Daily
attr(out$prim_sub_daily, "label") <- "Daily frequency of primary substance used at admission"
# Occupational status (levels like “Inactive”, “Unemployed” will print under this)
attr(out$occupation_condition_corr24,"label") <- "Occupational Status"
# Primary substance at admission – label the variable; levels print as rows
attr(out$primary_sub_mod, "label") <- "Primary substance at admission to treatment"
# Optional: quick audits mirroring your Excel pivots
audit1 <- out %>%
count(occupation_status_corr, occupation_condition_corr, name = "n") %>%
arrange(occupation_condition_corr, desc(n))
# Inactive/unemployed rows no longer carry a working status.
# Employed rows all have a status (with “other” as the fallback when missing).
out$year_decimal <- lubridate::year(out$birth_date_rec) +
(lubridate::yday(out$birth_date_rec) - 1) / 365.25
try(out$occupation_condition_inferred <- NULL)
# To keep the corrected data frame in the Global Environment as a named object:
assign("SISTRAT23_c1_2010_2024_df_prev1w_corr", out, envir = .GlobalEnv)
rm(out)
invisible("to explore")
# SISTRAT23_c1_2010_2024_df_prev1w_corr[, c("hash_key", "adm_age_rec3", "disch_date_num_rec6", "disch_date_num_rec6_trans", "adm_date_num_rec2", "tr_completion", "polysubstance_strict", "adm_disch_reason")]|> head(20)|> dput()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# DISCARD DATA
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# ------------------------------------------------------------
# 0) Canonical ordering within each patient (hash_key)
# Primary sort: admission date; tie-breaker: transformed discharge date
# ------------------------------------------------------------
SISTRAT23_c1_2010_2024_df_prev1w_corr_ord <-
SISTRAT23_c1_2010_2024_df_prev1w_corr|>
tidytable::ungroup()|>
tidytable::arrange(hash_key, adm_age_rec3, disch_date_num_rec6_trans)
# ------------------------------------------------------------
# 1) Mark first treatment per hash_key (1 = first treatment, 0 = otherwise)
#    Fixed: start from the *_ord frame built in step 0. The previous code
#    started from the unordered *_corr frame, so row_number() == 1 was not
#    guaranteed to pick the canonically-first treatment when admission
#    dates tie (the *_ord frame breaks ties on disch_date_num_rec6_trans).
# ------------------------------------------------------------
SISTRAT23_c1_2010_2024_df_prev1w_corr_marked <-
SISTRAT23_c1_2010_2024_df_prev1w_corr_ord|>
tidytable::group_by(hash_key)|>
tidytable::mutate(
initial_treatment = as.integer(tidytable::row_number() == 1L),
episodes= tidytable::n(),
)|>
tidytable::ungroup()
# ------------------------------------------------------------
# 2a) Flag treatment rows (any episode, not only the first) where:
# - adm_disch_reason == "death" OR
# - tr_compliance_rec7 == "currently in"
# Flagged rows are dropped per-treatment in the join below.
# ------------------------------------------------------------
# Per-row discard flags keyed by row id (rn). Note this flags ANY
# treatment row matching the criteria, not just each patient's first one;
# the subsequent join/filter removes flagged rows individually.
first_flags <-
SISTRAT23_c1_2010_2024_df_prev1w_corr_marked|>
tidytable::mutate(
# Exact, case-insensitive matches; the second pattern tolerates variable
# whitespace inside "currently in".
discard =
grepl("^death$", as.character(adm_disch_reason), ignore.case = TRUE) |
grepl("^currently\\s*in$", as.character(tr_compliance_rec7), ignore.case = TRUE)
)|>
tidytable::select(rn, discard)
message(paste0("Any treatment w/ death or ongoing tr: ", table(first_flags$discard)[[2]]))Code
# Drop flagged treatment rows: join the flags back by row id (rn) and keep
# rows whose flag is FALSE; coalesce() treats rows with no matching flag
# (NA after the left join) as "keep".
SISTRAT_clean_drop_patient <-
SISTRAT23_c1_2010_2024_df_prev1w_corr_marked|>
tidytable::left_join(first_flags, by = "rn")|>
tidytable::filter(!tidytable::coalesce(discard, FALSE))|>
tidytable::select(-discard)
# ------------------------------------------------------------
# 3) Drop the ENTIRE PATIENT if only one treatment
#    Episode counts are recomputed (2025-09-22) because step 2a removed
#    individual episodes.  The anonymous-function steps log counts and stash
#    them in globals (<<-) that the flowchart labels reuse later.
# ------------------------------------------------------------
SISTRAT_clean_drop_patient_multiple <-
  SISTRAT_clean_drop_patient |>
  tidytable::arrange(hash_key, adm_age_rec3) |>
  tidytable::group_by(hash_key) |>
  tidytable::mutate(
    initial_treatment = as.integer(tidytable::row_number() == 1L),
    episodes = tidytable::n()
  ) |>
  tidytable::ungroup() |>
  (\(df) {
    # Patients observed with a single episode (episodes == 1 rows are one per
    # patient, so the row count equals the patient count).  Computed once and
    # reused; the original filtered twice.
    hash_only_one_tr <<- tidytable::filter(df, episodes == 1) |>
      tidytable::pull(hash_key) |>
      length()
    message(paste0("Only one tr: ", hash_only_one_tr))
    df
  })() |>
  tidytable::filter(episodes > 1) |>
  (\(df) {
    # Final entry/patient counts, kept as globals for the flowchart labels.
    rows_final_db <<- nrow(df)
    hashs_final_db <<- tidytable::distinct(df, hash_key) |> nrow()
    message(paste0("Multiple treatments, Entries: ", rows_final_db))
    message(paste0("Multiple treatments, RUNs: ", hashs_final_db))
    df
  })()
#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("Function to format CreateTableOne into a database")
# Convert a tableone::TableOne object into a plain data.frame.
#
# print() on a TableOne returns (invisibly) the formatted character matrix;
# capture.output() is used only to silence the console printing, and its
# captured text is discarded.  Extra arguments (e.g. `nonnormal`, `smd`)
# are forwarded to print().  Matrix row names become a `characteristic`
# column; blank row names (factor-level rows) are filled downward so each
# level row repeats its variable name.
as.data.frame.TableOne <- function(x, ...) {
  invisible(utils::capture.output(
    tab <- print(x, showAllLevels = TRUE, varLabels = TRUE, ...)
  ))
  y <- as.data.frame(tab)
  y$characteristic <- dplyr::na_if(rownames(tab), "")
  y <- y %>%
    tidyr::fill(characteristic, .direction = "down") %>%
    dplyr::select(characteristic, dplyr::everything())
  rownames(y) <- NULL
  y
}
#_#_#_#_#_#_#_#_#_#_#_#_#_
# Pick the strata variable automatically
# (prefer the strict polysubstance definition when present; otherwise fall
# back to the plain "polysubstance" column)
strata_var <- if ("polysubstance_strict" %in% names(SISTRAT_clean_drop_patient_multiple)) "polysubstance_strict" else "polysubstance"
# Variables of interest (as you listed)
vars_interest <- c(
"tr_noncompletion",
"biopsych_comp_severe", # Biopsychosocial compromise
"treat_lt_90",
"treat_log_days",
"adm_age_rec3", # Age at Admission (First Entry)
"year_decimal", # Birth year (assumed numeric or year-like)
"sus_ini_mod_mvv", # Primary substance of the initial diagnosis
"dg_psiq_cie_10_dg", # Psychiatric comorbidity (ICD-10)
"dg_psiq_cie_10_instudy", # Psychiatric comorbidity (ICD-10) [In study]
#"cie10_broad_cat", # Broad diagnostic categories
#"prim_sub_freq", # Frequency of primary substance use at admission
"prim_sub_daily", # Daily (0/1 you created)
"occupation_condition_corr24", # Occupational status (corrected)
"primary_sub_mod" # Primary substance at admission to treatment
)
# Which of those should be treated as categorical in TableOne
# (the remaining vars_interest entries are summarised as continuous)
factorVars_interest <- c(
"biopsych_comp_severe",
"sus_ini_mod_mvv",
"dg_psiq_cie_10_dg",
"dg_psiq_cie_10_instudy",
#"cie10_broad_cat",
#"prim_sub_freq",
"prim_sub_daily", # treat 0/1 as categorical to show counts/%
"occupation_condition_corr24",
"primary_sub_mod",
"tr_noncompletion",
"treat_lt_90"
)
# 2) Variable labels (picked up by packages like table1/gtsummary; TableOne
#    itself does not use labels).  Declared as one named vector and applied
#    in a loop so the variable/label pairs are easy to audit in one place.
var_labels <- c(
  tr_noncompletion            = "Non-completion status of treatment (Dropout / Misspelled)",
  biopsych_comp_severe        = "Biopsychosocial compromise (Severe)",
  treat_lt_90                 = "Treatment duration (binary) (<90 days)",
  treat_log_days              = "Treatment duration (log-scaled days)",
  adm_age_rec3                = "Age at admission to treatment",
  year_decimal                = "Birth year",
  # Primary substance (initial diagnosis) -- levels print as rows
  sus_ini_mod_mvv             = "Primary substance (initial diagnosis)",
  # Psychiatric comorbidity
  dg_psiq_cie_10_instudy      = "Psychiatric comorbidity (ICD-10): In study",
  dg_psiq_cie_10_dg           = "Psychiatric comorbidity (ICD-10): Diagnosis",
  # Frequency / Daily
  prim_sub_daily              = "Daily frequency of primary substance used at admission",
  # Occupational status (levels like "Inactive", "Unemployed" print under this)
  occupation_condition_corr24 = "Occupational Status",
  # Primary substance at admission -- levels print as rows
  primary_sub_mod             = "Primary substance at admission to treatment"
)
for (v in names(var_labels)) {
  attr(SISTRAT_clean_drop_patient_multiple[[v]], "label") <- var_labels[[v]]
}
# Optional: if birth_date_rec is a Date, turn into year (uncomment if needed)
# if (inherits(df$birth_date_rec, "Date")) {
#   df <- df %>% mutate(birth_date_rec = as.integer(format(birth_date_rec, "%Y")))
# }
# Descriptive table on the FIRST episode per patient, stratified by the
# polysubstance indicator chosen above.
tbone_interest <- tableone::CreateTableOne(
  vars = vars_interest,
  # FIX: select the *actual* strata column.  The original hard-coded
  # "polysubstance_strict", which errors when `strata_var` falls back to
  # "polysubstance".
  data = tidytable::slice_head(SISTRAT_clean_drop_patient_multiple, n = 1, .by = hash_key) |>
    tidytable::select(tidytable::all_of(c(vars_interest, strata_var))),
  factorVars = factorVars_interest,
  strata = strata_var,
  addOverall = TRUE,   # extra "Overall" column besides the strata
  includeNA = TRUE,    # show NA as an explicit level for factors
  smd = TRUE,          # standardized mean differences between strata
  test = TRUE
)
# Make a composite key characteristic+level for ordering
# Uses the S3 method as.data.frame.TableOne defined above; `nonnormal`
# switches those continuous vars to median [IQR] summaries.
df_tbone <- as.data.frame.TableOne(
tbone_interest,
nonnormal = c("adm_age_rec3", "year_decimal", "treat_log_days"),
smd = TRUE
)
# Build desired order as a tibble with both characteristic and level
# NOTE: the `characteristic` strings must match the printed TableOne labels
# exactly (including the trailing " (%)" / " (median [IQR])"); NA `level`
# rows match the missing-level rows produced by includeNA = TRUE.
order_df <- tibble::tribble(
~characteristic, ~level,
"n", "",
"Non-completion status of treatment (Dropout / Misspelled) (%)", "0",
"Non-completion status of treatment (Dropout / Misspelled) (%)", "1",
"Biopsychosocial compromise (Severe) (%)", "0",
"Biopsychosocial compromise (Severe) (%)", "1",
"Biopsychosocial compromise (Severe) (%)", NA,
"Treatment duration (binary) (<90 days) (%)", "0",
"Treatment duration (binary) (<90 days) (%)", "1",
"Treatment duration (log-scaled days) (median [IQR])", "",
"Age at admission to treatment (median [IQR])", "",
"Birth year (median [IQR])", "",
"Primary substance (initial diagnosis) (%)", "cocaine powder",
"Primary substance (initial diagnosis) (%)", "cocaine paste",
"Primary substance (initial diagnosis) (%)", "marijuana",
"Primary substance (initial diagnosis) (%)", "alcohol",
"Primary substance (initial diagnosis) (%)", "others",
"Primary substance (initial diagnosis) (%)", NA,
"Psychiatric comorbidity (ICD-10): In study (%)", "TRUE",
"Psychiatric comorbidity (ICD-10): In study (%)", "FALSE",
"Psychiatric comorbidity (ICD-10): Diagnosis (%)", "TRUE",
"Daily frequency of primary substance used at admission (%)", "0",
"Daily frequency of primary substance used at admission (%)", "1",
"Daily frequency of primary substance used at admission (%)", NA,
"Occupational Status (%)", "inactive",
"Occupational Status (%)", "unemployed",
"Occupational Status (%)", "employed",
"Primary substance at admission to treatment (%)", "cocaine powder",
"Primary substance at admission to treatment (%)", "cocaine paste",
"Primary substance at admission to treatment (%)", "marijuana",
"Primary substance at admission to treatment (%)", "alcohol",
"Primary substance at admission to treatment (%)", "others"
)
# Attach the desired display order via the (characteristic, level) key,
# sort by it, then drop the helper column.  Rows not listed in order_df get
# order_id = NA and sort to the bottom.
order_key <- order_df %>% mutate(order_id = dplyr::row_number())
df_tbone_ordered <- df_tbone %>%
  left_join(order_key, by = c("characteristic", "level")) %>%
  arrange(order_id) %>%
  select(-order_id)
# Final descriptives table: drop reference levels, relabel "powder" to
# "hydrochloride", tidy the count-(%) spacing, and render as markdown.
# NOTE(review): `filter(level!=0, level!=FALSE)` also drops NA-level rows
# (NA comparisons yield NA, which filter() treats as drop) -- confirm the
# includeNA rows are meant to disappear here.
# NOTE(review): in `knitr::kable(x, "markdown", "Descriptives")` the third
# positional argument is `digits`, not `caption` -- probably intended
# `caption = "Descriptives"`.
# NOTE(review): `format_cells()` is a project helper not defined in this
# file; it appears to bold the first row.
df_tbone_ordered|>
filter(level!=0, level!=FALSE)|>
mutate(level=gsub("powder", "hydrochloride", level))|>
mutate(
Overall = gsub("\\(\\s+", "(", Overall),
`0` = gsub("\\(\\s+", "(", `0`),
`1` = gsub("\\(\\s+", "(", `1`)
)|>
select(characteristic, level, `0`, `1`, Overall, SMD)|>
tibble::as_tibble()|>
(\(df) {
format_cells(df,1, 1:length(names(df)), "bold")
})() |>
knitr::kable("markdown", "Descriptives")| characteristic | level | 0 | 1 | Overall | SMD |
|---|---|---|---|---|---|
| n | 5956 | 21489 | 27445 | ||
| Non-completion status of treatment (Dropout / Misspelled) (%) | 1 | 4323 (72.6) | 16909 (78.7) | 21232 (77.4) | |
| Biopsychosocial compromise (Severe) (%) | 1 | 1520 (25.5) | 8291 (38.6) | 9811 (35.7) | |
| Treatment duration (binary) (<90 days) (%) | 1 | 1196 (20.1) | 5329 (24.8) | 6525 (23.8) | |
| Treatment duration (log-scaled days) (median [IQR]) | 5.25 [4.65, 5.80] | 5.13 [4.51, 5.70] | 5.16 [4.53, 5.72] | 0.146 | |
| Age at admission to treatment (median [IQR]) | 38.56 [30.44, 47.58] | 31.66 [26.12, 38.65] | 32.82 [26.76, 40.69] | 0.625 | |
| Birth year (median [IQR]) | 1977.48 [1968.61, 1985.57] | 1982.91 [1975.73, 1988.85] | 1982.07 [1974.10, 1988.34] | 0.492 | |
| Primary substance (initial diagnosis) (%) | cocaine hydrochloride | 492 (8.3) | 1681 (7.8) | 2173 (7.9) | |
| Primary substance (initial diagnosis) (%) | cocaine paste | 789 (13.2) | 2312 (10.8) | 3101 (11.3) | 0.267 |
| Primary substance (initial diagnosis) (%) | marijuana | 533 (8.9) | 3766 (17.5) | 4299 (15.7) | |
| Primary substance (initial diagnosis) (%) | alcohol | 4064 (68.2) | 13428 (62.5) | 17492 (63.7) | |
| Primary substance (initial diagnosis) (%) | others | 41 (0.7) | 81 (0.4) | 122 (0.4) | |
| Psychiatric comorbidity (ICD-10): In study (%) | TRUE | 862 (14.5) | 3903 (18.2) | 4765 (17.4) | |
| Psychiatric comorbidity (ICD-10): Diagnosis (%) | TRUE | 2638 (44.3) | 10148 (47.2) | 12786 (46.6) | |
| Daily frequency of primary substance used at admission (%) | 1 | 2561 (43.0) | 10377 (48.3) | 12938 (47.1) | |
| Occupational Status (%) | inactive | 1240 (20.8) | 4214 (19.6) | 5454 (19.9) | |
| Occupational Status (%) | unemployed | 1758 (29.5) | 8263 (38.5) | 10021 (36.5) | |
| Occupational Status (%) | employed | 2958 (49.7) | 9012 (41.9) | 11970 (43.6) | 0.194 |
| Primary substance at admission to treatment (%) | cocaine hydrochloride | 810 (13.6) | 4789 (22.3) | 5599 (20.4) | |
| Primary substance at admission to treatment (%) | cocaine paste | 1720 (28.9) | 10507 (48.9) | 12227 (44.6) | 0.710 |
| Primary substance at admission to treatment (%) | marijuana | 189 (3.2) | 1383 (6.4) | 1572 (5.7) | |
| Primary substance at admission to treatment (%) | alcohol | 3138 (52.7) | 4435 (20.6) | 7573 (27.6) | |
| Primary substance at admission to treatment (%) | others | 99 (1.7) | 375 (1.7) | 474 (1.7) |
Code
# Labels for the flowchart nodes.  Counts are computed from the data so the
# diagram stays in sync with the processing steps above.  "\\l" is the
# graphviz left-justified line break used inside note nodes.
lab_orig <- paste0(
  "Original C1 Dataset \n(n = ",
  formatC(nrow(df23_ndp_20250824_SISTRAT23_c1_1024), format = "f", big.mark = ",", digits = 0),
  "; Patients = ",
  formatC(dplyr::n_distinct(df23_ndp_20250824_SISTRAT23_c1_1024$hash_key), format = "f", big.mark = ",", digits = 0),
  ")"
)
lab_orig_clean <- paste(
  "Pre-processing & Quality Checks",
  "• Remove exact duplicate records",
  "• Resolve overlapping episodes (keep longest / merge)",
  "• Fix inconsistent admission/discharge dates",
  "• Correct implausible birth dates/ages",
  "• Remove negative/implausible days in treatment",
  "",
  sep = "\\l"
)
lab_post_orig <- paste0('C1 Dataset \n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1t), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1t, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_post_orig_clean <- paste("• Collapse consecutive linked", "treatments (≤45d gap, referral=yes)", sep = "\\l")
lab_proc <- paste0('C1 Cleaned & Collapsed\n(n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1w), format='f', big.mark=',', digits=0), '; Patients = ',formatC(dplyr::distinct(SISTRAT23_c1_2010_2024_df_prev1w, hash_key)|> nrow(), format='f', big.mark=',', digits=0),')')
lab_flag <- "Sorted by Admission Date\n+ First Treatment Flag"
# FIX: compute the post-rule counts from SISTRAT_clean_drop_patient instead
# of the stale hard-coded "n = 169,936; Patients = 117,507" example.
lab_after <- paste0(
  "After First-Treatment Rule\nn = ",
  formatC(nrow(SISTRAT_clean_drop_patient), format = "f", big.mark = ",", digits = 0),
  "; Patients = ",
  formatC(dplyr::n_distinct(SISTRAT_clean_drop_patient$hash_key), format = "f", big.mark = ",", digits = 0)
)
lab_final <- paste0("Final C1 Dataset\nn = ",
  formatC(rows_final_db, format = "f", digits = 0, big.mark = ","),
  "; Patients = ",
  formatC(hashs_final_db, format = "f", digits = 0, big.mark = ","))
lab_discard_first <- paste(
  "Discard treatment episodes if:",
  "• Administrative discharge reason= \\\"death\\\"",
  "• Tr. compliance status = \\\"currently in\\\"",
  "",
  # FIX: sum() is robust when zero episodes are flagged; table(...)[[2]] errors.
  paste0("n = ", formatC(sum(first_flags$discard, na.rm = TRUE), format = "f", digits = 0, big.mark = ",")," put aside"),
  "",
  sep = "\\l"
)
lab_discard_single <- paste0(
  "Discard patients with only ONE treatment\n",
  "Patients = ", formatC(hash_only_one_tr, format = "f", big.mark = ",", digits = 0)
)
# Diagram
# Flowchart of the data-cleaning pipeline (graphviz DOT via DiagrammeR).
# The label strings built above are spliced into the DOT source; invisible
# point nodes (v00, v0, v1, v2, v3) anchor the side notes at the right height.
gr <- DiagrammeR::grViz(
paste0(
'digraph flowchart {
graph [layout = dot, rankdir = TB]
node [fontname = "Times", shape = rectangle, fontsize = 15, style = filled, fillcolor = white, ranksep=0.25, nodesep=0.25]
# New initial box and pre-clean note
pre_original [label = "', lab_orig, '", fillcolor = lightgray, shape = box]
pre_clean [label = "', lab_orig_clean, '", shape = note, fillcolor = white]
# Intermediate new initial box and pre-clean note
pre2_original [label = "', lab_post_orig, '", fillcolor = white, shape = box]
pre2_clean [label = "', lab_post_orig_clean, '", shape = note, fillcolor = lemonchiffon]
# Existing boxes
original [label = "', lab_proc, '", fillcolor = lightgray, shape = box]
marked [label = "', lab_flag, '", shape = box]
after_rule [label = "', lab_after, '", shape = box]
final_dataset [label = "', lab_final, '", fillcolor = lightgray, shape = box]
discard_first [label = "', lab_discard_first, '", shape = note, fillcolor = mistyrose]
discard_single [label = "', lab_discard_single, '", shape = note, fillcolor = mistyrose]
# Invisible points for alignment
v00 [shape = point, width = 0, style = invis] # between orig and post orig
v0 [shape = point, width = 0, style = invis] # between pre_original and original
v1 [shape = point, width = 0, style = invis]
v2 [shape = point, width = 0, style = invis]
v3 [shape = point, width = 0, style = invis]
# Main flow (add pre_original -> v0 -> original in front)
pre_original -> v00 [arrowhead = none]
v00 -> pre2_original
pre2_original -> v0 [arrowhead = none]
v0 -> original
original -> v1 [arrowhead = none]
v1 -> marked
marked -> v2 [arrowhead = none]
v2 -> after_rule
after_rule -> v3 [arrowhead = none]
v3 -> final_dataset
# Discard / note connections with solid black arrows
v00 -> pre_clean [color = black] # new pre-processing note
v0 -> pre2_clean [color = black] # new pre-processing note
v2 -> discard_first [color = black]
v3 -> discard_single[color = black]
# Alignment
{ rank = same; pre_clean; v00 }
{ rank = same; pre2_clean; v0 }
{ rank = same; discard_first; v2 }
{ rank = same; discard_single; v3 }
}'
),
width = 600, height = 900
)
gr
Figure 3. Flowchart
Code
# Export the flowchart: remove any stale supporting-files folder, save the
# htmlwidget, then screenshot it to PNG (zoom=10 for high resolution).
unlink(paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_psu_files"), recursive = TRUE)
htmlwidgets::saveWidget(gr, paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_psu.html"))
webshot::webshot(paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_psu.html"),
paste0(gsub("/cons","",getwd()),"/cons/_figs/_flowchart_psu.png"),
vwidth = 300, vheight = 300*1.5, zoom=10, expand=100) # Prueba con diferentes coordenadas top, left, width, and heightWe compared the characteristics of people who used multiple substances and those who did not in the total database between 2010 and 2024.
Missingness
Code
# Share of rows with at least one missing value among the analysis variables
# (vars_interest plus the strata and plan-type columns).
vars_chk <- c(vars_interest, "polysubstance_strict", "plan_type_mod")
pct_any_missing <-
  mean(!complete.cases(subset(SISTRAT_clean_drop_patient_multiple, select = vars_chk)))
paste0(
  "Percentage of the total that has missing values: ",
  scales::percent(pct_any_missing, accuracy = 0.1),
  "; total = ",
  format(nrow(SISTRAT_clean_drop_patient_multiple), big.mark = ",")
)
# Previously recorded results, kept for reference:
#[1] "Percentage of the total that has missing values: 84.4%; total= 90,075"
# 76029
#SEP 2025
#[1] "Percentage of the total that has missing values: 2.4%; total = 83,174"
# Variables to check
vars_check <- c(vars_interest, "polysubstance_strict", "plan_type_mod")
# Compute % missing per variable
missing_pct <- colMeans(
  is.na(subset(SISTRAT_clean_drop_patient_multiple, select = vars_check))
) * 100
# Round and format into data.frame
# FIX: row.names = NULL -- otherwise data.frame() inherits the names of
# `missing_pct` as row names and kable() prints them as an extra column
# duplicating `Variable` (visible in the rendered table).
df_missing <- data.frame(
  Variable = names(missing_pct),
  Missing_Percent = round(missing_pct, 2),
  row.names = NULL
)
# Display as markdown table
knitr::kable(df_missing, format = "markdown", align = c("l","r"),
             col.names = c("Variable", "% Missing"))
# Recorded output of the chunk above:
# [1] "Percentage of the total that has missing values: 2.3%; total = 67,957"
| Variable | % Missing | |
|---|---|---|
| tr_noncompletion | tr_noncompletion | 0.00 |
| biopsych_comp_severe | biopsych_comp_severe | 1.16 |
| treat_lt_90 | treat_lt_90 | 0.00 |
| treat_log_days | treat_log_days | 0.00 |
| adm_age_rec3 | adm_age_rec3 | 0.00 |
| year_decimal | year_decimal | 0.00 |
| sus_ini_mod_mvv | sus_ini_mod_mvv | 0.69 |
| dg_psiq_cie_10_dg | dg_psiq_cie_10_dg | 0.00 |
| dg_psiq_cie_10_instudy | dg_psiq_cie_10_instudy | 0.00 |
| prim_sub_daily | prim_sub_daily | 0.51 |
| occupation_condition_corr24 | occupation_condition_corr24 | 0.00 |
| primary_sub_mod | primary_sub_mod | 0.00 |
| polysubstance_strict | polysubstance_strict | 0.00 |
| plan_type_mod | plan_type_mod | 0.00 |
Code
set.seed(2125)
# -----------------------------
# 0) Targets (ONLY these will be imputed)
# `vars_interest` plus the two extra variables requested.
# -----------------------------
vars_impute <- unique(c(vars_interest, "polysubstance_strict", "plan_type_mod"))
# Keep only variables that actually exist to avoid errors
vars_impute <- intersect(vars_impute, names(SISTRAT_clean_drop_patient_multiple))
if (length(vars_impute) == 0) stop("None of the vars in vars_interest/polysubstance_strict/plan_type_mod exist in the data.")
# -----------------------------
# 1) Predictors (expand/trim as preferred).
# Tip: include easy, generally-available covariates to make MAR more plausible.
# FIX: removed the duplicate "episodes" and "pub_center" entries the original
# listed twice (harmless after unique(), but misleading to read).
# -----------------------------
predictor_pool <- c(
  # strong helpers already in the dataset
  "adm_age_rec3", "year_decimal", "treat_log_days", "treat_lt_90",
  "biopsych_comp_severe", "tr_noncompletion", "prim_sub_daily",
  "primary_sub_mod", "occupation_condition_corr24",
  "pub_center", "type_center", "episodes",
  "sex_rec", "plan_type_mod", "primary_sub_std", "tr_completion",
  "dg_psiq_cie_10_dg", "dg_psiq_cie_10_instudy",
  "age_subs_onset_rec2", "age_primary_onset_rec2",
  "tenure_status_household", "macrozone_center", "adm_motive",
  "marital_status", "ed_attainment", "sub_dep_icd10_status", "senda", "num_trat_ant",
  "fecha_ultimo_tratamiento"
)
# Use only columns that exist (targets are also allowed as predictors)
predictor_pool <- intersect(unique(c(predictor_pool, vars_impute)), names(SISTRAT_clean_drop_patient_multiple))
# Make sure we still have something on the RHS
if (length(predictor_pool) == 0) stop("No predictors found in the data. Please add some predictor names that exist.")
# -----------------------------
# 2) Build formula: (targets) ~ (predictors)
# missRanger will impute ONLY the LHS variables
# -----------------------------
lhs <- paste(vars_impute, collapse = " + ")
rhs <- paste(predictor_pool, collapse = " + ")
form_miss <- as.formula(paste(lhs, "~", rhs))
# -----------------------------
# 3) Sort for reproducibility, subset the needed columns, run missRanger
# -----------------------------
dat_ord <- dplyr::arrange(SISTRAT_clean_drop_patient_multiple, hash_key, adm_date_num_rec2)
# Columns handed to missRanger = imputation targets + their predictors.
cols_mr <- unique(c(vars_impute, predictor_pool))
# Column subsetting differs between data.table/tidytable (..cols) and plain
# data.frame/tibble; the if() is used as an expression here.
dat_sub <- if (inherits(dat_ord, "data.table")) {
  dat_ord[, ..cols_mr]
} else {
  dat_ord[, cols_mr, drop = FALSE]
}
# Coerce to a plain data.frame to avoid method-dispatch surprises.
dat_sub <- as.data.frame(dat_sub)
# Chained random-forest imputation with predictive-mean matching (pmm.k = 5);
# only the LHS variables of `form_miss` (vars_impute) are imputed.
SISTRAT_imputed_only <- missRanger::missRanger(
  data = dat_sub,
  formula = form_miss,
  num.trees = 100,
  pmm.k = 5,
  maxiter = 50,
  returnOOB = TRUE,
  verbose = 2,
  seed = 2125
)
Variables to impute: prim_sub_daily, sus_ini_mod_mvv, biopsych_comp_severe
Variables used to impute: adm_age_rec3, year_decimal, treat_log_days, treat_lt_90, biopsych_comp_severe, tr_noncompletion, prim_sub_daily, primary_sub_mod, occupation_condition_corr24, episodes, sex_rec, plan_type_mod, dg_psiq_cie_10_dg, dg_psiq_cie_10_instudy, adm_motive, sub_dep_icd10_status, senda, sus_ini_mod_mvv, polysubstance_strict
prm_s_ ss_n__ bpsy__
iter 1: 0.8733 0.3432 0.7810
iter 2: 0.8394 0.3438 0.7789
iter 3: 0.8397 0.3444 0.7813
Code
# -----------------------------
# 4) Write imputed values back to your full data
# -----------------------------
# Make safe copies as data.frame
SISTRAT_clean_drop_patient_multiple_imp <- as.data.frame(dat_ord)
SISTRAT_imputed_only_df <- as.data.frame(SISTRAT_imputed_only)
# Regular data.frame column replacement works with a character vector of names
SISTRAT_clean_drop_patient_multiple_imp[vars_impute] <- SISTRAT_imputed_only_df[vars_impute]
# -----------------------------
# 5) OOB error summary → kable
# (missRanger stores per-target OOB error in the "oob" attribute)
# -----------------------------
oob_vec <- attr(SISTRAT_imputed_only, "oob")
if (!is.null(oob_vec)) {
df_oob <- data.frame(
Variable = names(oob_vec),
OOB_Error = round(as.numeric(oob_vec),3)
)
df_oob <- df_oob[order(df_oob$Variable), , drop = FALSE]
knitr::kable(df_oob, format = "markdown", align = c("l","r"),
col.names = c("Variable", "OOB error (1 - R² or class error)"))
} else {
message("No OOB error attribute returned by missRanger.")
}| Variable | OOB error (1 - R² or class error) | |
|---|---|---|
| 3 | biopsych_comp_severe | 0.779 |
| 1 | prim_sub_daily | 0.839 |
| 2 | sus_ini_mod_mvv | 0.344 |
Code
# | |Variable | OOB error (1 - R² or class error)|
# |:--|:--------------------|---------------------------------:|
# |4 |biopsych_comp_severe | 0.688|
# |1 |polysubstance_strict | 0.790|
# |2 |prim_sub_daily | 0.814|
# |3 |sus_ini_mod_mvv | 0.338|
#<0.2 good; 0.2–0.4 fair; 0.4–0.6 poor; >0.6 very poor for classification-type targets.
#OOB prediction error per iteration and variable (1 minus R-squared for regression)
#The default mtry in missRanger is sqrt(p), where p is the number of variables in the dataset.
#OOB prediction errors are quantified as 1 - R^2 for numeric variables, and as classification error otherwise. If a variable has been imputed only univariately, the value is 1.
#https://rdrr.io/cran/missRanger/man/missRanger.html
We then formatted the data for a survival setting.
Code
# Censor date you specified
censor_date <- as.Date("2025-05-28")
# Build the survival-format trajectory data: one row per treatment episode,
# with follow-up measured in months from each patient's first admission.
SISTRAT_traj <-
SISTRAT_clean_drop_patient_multiple_imp|>
tidytable::ungroup()|>
# Canonical ordering within each patient
tidytable::arrange(hash_key, adm_date_rec2, disch_date_rec6)|>
tidytable::group_by(hash_key)|>
# Treatment sequence and entry date
tidytable::mutate(
treatment = tidytable::row_number(),
is_first_occurrence = tidytable::if_else(treatment == 1, 1, 0),
max_treatment = max(treatment, na.rm = TRUE),
enter_date = min(adm_date_rec2, na.rm = TRUE)
)|>
# NOTE(review): this grouped cumsum id is overwritten below by
# as.numeric(factor(hash_key)) -- this first assignment appears redundant.
tidytable::mutate(id = cumsum(is_first_occurrence))|>
tidytable::ungroup()|>
# Durations in MONTHS via lubridate::time_length (no 30.5 division)
tidytable::mutate(
time = lubridate::time_length(lubridate::interval(enter_date, disch_date_rec6), unit = "months"),
cens_time = lubridate::time_length(lubridate::interval(enter_date, censor_date), unit = "months")
)|>
# Tie-breaker for identical times within person: add small offsets 0.001, 0.002, ...
tidytable::mutate(dup_idx = tidytable::row_number(), .by = c(hash_key, time))|>
tidytable::mutate(time = time + (dup_idx - 1L) * 0.001)|>
# Lag (optional diagnostic)
tidytable::mutate(lag_time = tidytable::lag(time), .by = hash_key)|>
# Survival time: use censoring if no event (tr_noncompletion == 0)
tidytable::mutate(
surv_time = tidytable::if_else(tidytable::coalesce(tr_noncompletion, 0L) == 0L, cens_time, time),
id = as.numeric(factor(hash_key))
)|>
# Clean up helper
tidytable::select(-dup_idx, -is_first_occurrence)We constructed right‑censored survival times from the first recorded admission per patient (time origin), measuring follow‑up in months using calendar‑aware intervals (lubridate::interval + time_length). Episodes were ordered chronologically and minor within‑subject time ties were broken by adding negligible offsets (≤0.002 months) to ensure strictly increasing times. The event was defined as non‑completion of treatment (tr_noncompletion != 0); completions and ongoing treatments at the censoring date (2025‑05‑28) were right‑censored. A numeric patient id was generated from hash_key for modeling. Process is summarised in SISTRAT_traj database.
IrregLong
Abacus
Code
set.seed(21251)
# Abacus plot: visit times for a random sample of n = 25 subjects over the
# first 0-120 months of follow-up.  The window bounds are passed by name
# (tmin/tmax) rather than positionally, as in the original call.
IrregLong::abacus.plot(
  n = 25,
  time = "time",
  id = "id",
  data = as.data.frame(SISTRAT_traj),
  tmin = 0,
  tmax = 120,
  xlab.abacus = "Time (in months of follow-up)",
  ylab.abacus = "Subject",
  pch.abacus = 16,
  col.abacus = 1
)
# Quantify how irregular the observation times are (IrregLong): proportions
# of subjects with 0 / 1 / 2+ observations per bin across 50 bin widths.
counts <- IrregLong::extent.of.irregularity(as.data.frame(SISTRAT_traj),
time="time",
id="id",
scheduledtimes=NULL, cutpoints=NULL,ncutpts=50,
maxfu=16*24, plot=F, legendx=30, legendy=0.8,
formula=survival::Surv(time.lag,time,event)~1,tau=12*12)
# Cached copy of counts$counts (3 rows: 0/1/2+ obs per bin; 50 bin widths),
# kept inline so the figure can be rebuilt without rerunning the call above.
counts_counts<-
structure(c(0, 0, 1, 0.214519948988887, 0.419402441246129,
0.366077609764984, 0.385631869800206, 0.432780712941034, 0.18158741725876,
0.499799599198397, 0.394835124795045, 0.105365276006559, 0.579908908726544,
0.352370194935325, 0.0677208963381308, 0.638197607335884, 0.315455152729702,
0.0463472399344143, 0.682565130260521, 0.284079848007704, 0.0333550217317752,
0.71748496993988, 0.257524139187466, 0.0249908908726544, 0.745661019007712,
0.2350357280217, 0.0193032529705876, 0.768762980506468, 0.215991983967936,
0.0152450355255966, 0.788042200102685, 0.199701883105053, 0.0122559167922622,
0.804253962470395, 0.185829234226028, 0.00991680330357685, 0.818358395111902,
0.173381728491949, 0.00825987639614894, 0.830687348723421, 0.162212737162637,
0.00709991411394217, 0.841265561425882, 0.152763709236655, 0.0059707293374628,
0.850528329386045, 0.144491255237748, 0.00498041537620696, 0.858847105976659,
0.136909112342332, 0.00424378168100908, 0.866415659602032, 0.129819234428453,
0.00376510596951479, 0.873116568064358, 0.123627158623467, 0.00325627331217459,
0.879356895609401, 0.11764073601749, 0.00300236837310986, 0.884843279632859,
0.11254023197911, 0.00261648838803147, 0.889925305155766, 0.107711290349294,
0.00236340449494029, 0.894568583807932, 0.103314930255768, 0.00211648593629948,
0.898783931499362, 0.0993578065221352, 0.00185826197850246, 0.902723264711241,
0.0955991983967936, 0.00167753689196575, 0.906379191950334, 0.0920848690387769,
0.00153593901088891, 0.9097697077657, 0.0888241128722091, 0.00140617936209119,
0.9129245503995, 0.085783254821331, 0.00129219477916873, 0.915873125561468,
0.082926982491629, 0.00119989194690321, 0.918599623489403, 0.0803145685310014,
0.00108580797959555, 0.921176076493162, 0.0778225071844569, 0.00100141632238083,
0.923578976134086, 0.0755101111313536, 0.000910912734560029,
0.925866057183237, 0.0732771327779526, 0.000856810038810403,
0.928000385798335, 0.0712119426017811, 0.000787671599884261,
0.930049189287666, 0.0691908492309294, 0.000759961481404367,
0.931936600473675, 0.0673792028501447, 0.000684196676180644,
0.933738730532318, 0.0656300315618953, 0.000631237905787004,
0.935448888207036, 0.0639690864983556, 0.000582025294608356,
0.937097505033377, 0.062341933283817, 0.000560561682806172, 0.938649116414647,
0.0608261978502459, 0.000524685735106577, 0.940133037694013,
0.059365738128141, 0.000501224177845714, 0.941533283016249, 0.0579991151133436,
0.000467601870407482, 0.942889584666161, 0.0566553826469006,
0.000455032686938359, 0.94417347090876, 0.0553950876960533, 0.000431441395187068,
0.94538734033724, 0.0542175259610129, 0.000395133701746928, 0.946575364167069,
0.0530357157001751, 0.00038892013275563, 0.94768957644496, 0.0519530356651407,
0.000357387889899722, 0.948758122305217, 0.050912430922451, 0.000329446772332544,
0.949789746468819, 0.0499009150025468, 0.000309338528634263,
0.950787392967754, 0.0489152851156859, 0.000297321916560394), dim = c(3L,
50L))
# counts$auc
# [1] 0.2248806
# > counts$transformed.auc
# [1] 86.18703
#→ 86 suggests substantial irregularity in observation times across subjects.
# Line plot: mean proportion of subjects per observation count, by bin width.
# NOTE(review): `linewidth=1` is passed to ggplot()'s ... (after the mapping)
# and is silently ignored; it probably belongs in geom_line(linewidth = 1).
plot_irregularity_05_2024<-
rbind.data.frame(cbind.data.frame(time= 1:50,data.frame(t(counts_counts)))) %>%
reshape2::melt(id=c("time")) %>%
dplyr::mutate(Observations= factor(variable, labels=c("0 obs per bin","1 obs per bin", "2+ obs per bin"))) %>%
ggplot(aes(x=time, y=value, linetype=Observations), linewidth=1)+
geom_line()+
#facet_wrap(~type)+ #color=factor(type))
theme_classic()+
xlab("Bin width (%)")+
ylab("Mean proportions of individuals")#+
# Save the figure (ggsave's `file=` partially matches `filename`).
ggsave(plot_irregularity_05_2024, file= paste0(wdpath,"cons/_figs/irreg250921.jpg"), width=8.5, height=5.5, dpi=500)The irregularity metrics indicate a meaningful departure from perfectly scheduled repeated measures: although the raw AUC is modest (≈0.225), its transformed value (≈86) points to substantial variability in visit timing. This supports modeling strategies that explicitly accommodate irregular observation times and potential informativeness of assessment schedules—conditionally on a rich set of covariates, including baseline factors, prior outcomes, and visit history. The figure displays mean proportions of subjects with 0, 1, and ≥2 visits per bin as the bin width ranges from 1% to 50% of the inter-admission interval. Notably, in the earliest bins (roughly the first five), multiple observations frequently fall within the same time bin.
Longitudinal formatting
We create dummy variables (binary 0/1 indicators) from categorical variables in the dataset sdat, for occupational status and for the starting/initial and primary substances.
We also centered the year variable around 2015 (year_c), making 2015 the baseline (value = 0), which improves coefficient interpretation and reduces multicollinearity. Second, we standardized age by centering it at the mean and scaling by 10 (age_c10), so each unit represents a 10-year difference from the average age—making effects easier to interpret per decade and improving numerical stability. Together, these transformations create meaningful reference points, aid in comparing effect sizes, and support more robust statistical modeling.
Code
# clean memory
gc()
# --- Prepare data: plain data.frame for IrregLong ---
sdat <- as.data.frame(SISTRAT_traj)
# ID as factor; time must be numeric; create visit indicator (every row is a visit)
sdat$id <- as.numeric(factor(sdat$hash_key))
sdat$time <- as.numeric(sdat$time)
sdat$event_vis <- 1L
# Dummified vars (0/1 indicators for occupation and substance categories;
# comparisons against NA yield NA, so missing categories stay NA)
sdat$occ24_inactive <- as.integer(sdat$occupation_condition_corr24 == "inactive")
sdat$occ24_unemployed <- as.integer(sdat$occupation_condition_corr24 == "unemployed")
sdat$susinidum_oh <- as.integer(sdat$sus_ini_mod_mvv == "alcohol")
sdat$susinidum_coc <- as.integer(sdat$sus_ini_mod_mvv == "cocaine powder")
sdat$susinidum_pbc <- as.integer(sdat$sus_ini_mod_mvv == "cocaine paste")
sdat$susinidum_mar <- as.integer(sdat$sus_ini_mod_mvv == "marijuana")
sdat$susprindum_oh <- as.integer(sdat$primary_sub_mod == "alcohol")
sdat$susprindum_coc <- as.integer(sdat$primary_sub_mod == "cocaine powder")
sdat$susprindum_pbc <- as.integer(sdat$primary_sub_mod == "cocaine paste")
sdat$susprindum_mar <- as.integer(sdat$primary_sub_mod == "marijuana")
# Center year at 2015 and age at its mean, scaled per decade
sdat$year_c <- sdat$year_decimal - 2015
sdat$age_c10 <- (sdat$adm_age_rec3 - mean(sdat$adm_age_rec3, na.rm=TRUE)) / 10
#Ensure treat_log_days exists, is numeric, and finite
#keep NA for non-positive so the lagger can handle it
# NOTE(review): the stopifnot() asserts there are NO non-finite values, which
# makes the NA replacement on the next line a no-op whenever it is reached --
# confirm whether the assertion or the fallback is the intended behaviour.
stopifnot(sdat$treat_log_days[!is.finite(sdat$treat_log_days)]|> length()==0)
sdat$treat_log_days[!is.finite(sdat$treat_log_days)] <- NA_real_ used (Mb) gc trigger (Mb) max used (Mb)
Ncells 4363067 233.1 6436229 343.8 6436229 343.8
Vcells 187094595 1427.5 383112096 2923.0 383110494 2923.0
Intercept-only and general weights
Code
# Summarize per-row censoring time (months) before the intensity model.
cat("Survival time\n")
summary(sdat$cens_time)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 6.742  90.161 124.871 121.133 157.355 304.871
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#   7.452  95.484 129.484 125.146 159.806 304.871
# Keep your treatment outcome separately (edit if you want a different outcome)
# Here, 1 = noncompletion/dropout; 0 = otherwise
sdat$tr_outcome <- as.integer(sdat$tr_noncompletion)
# --- Max follow-up per subject (use your cens_time in MONTHS) ---
# One row per id with the maximum censoring time; column names must be
# maxfu.id / maxfu.time as required by IrregLong::iiw.weights().
maxfu_df <- aggregate(cens_time ~ id, data = sdat, FUN = function(x) max(x, na.rm = TRUE))
names(maxfu_df) <- c("maxfu.id", "maxfu.time")
# --- Visit-intensity model with inverse-intensity weights ---
# The function will internally create time.lag and tr_outcome.lag because we list them in `lagvars`
# Intercept-like model: only the lagged outcome predicts visit intensity.
i0 <- IrregLong::iiw.weights(
formulaph = survival::Surv(time.lag, time, event_vis) ~ tr_outcome.lag + survival::cluster(id),
# If you want stabilized weights, uncomment and provide a null model (often just cluster):
# formulanull = ~ survival::cluster(id),
data = sdat,
id = "id",
time = "time",
event = "event_vis", # each row is a visit
lagvars = c("time", "tr_outcome"),
invariant = "id",
maxfu = maxfu_df, # per-ID maximum follow-up (same units as 'time': months)
lagfirst = c(0, 0), # time.lag = 0 at first visit; tr_outcome.lag = 0 at first visit
# first = TRUE: baseline visit observed with prob = 1.
# The warning below is console residue: zero-length intervals create NAs.
first = TRUE # baseline visit observed with prob = 1
)Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Code
# Model summary for the visit-intensity fit
i0$m
# coxph(formula = formulaph, data = datacox)
#
#                            coef  exp(coef)   se(coef)       z      p
# tr_outcome.lag       -6.883e-01  5.025e-01  8.237e-03 -83.560 <2e-16
# survival::cluster(id) 3.000e-07  1.000e+00  4.848e-07   0.619  0.536
#
# Likelihood ratio test=6858  on 2 df, p=< 2.2e-16
# n= 95355, number of events= 67912
#    (45 observations deleted due to missingness)
# Rebuild the model frame with na.pass so incomplete rows are retained and
# can be located (used below to identify which rows coxph would drop).
mf_all <- stats::model.frame(i0$m, data = i0$datacox, na.action = stats::na.pass)Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Code
# Identify the rows coxph silently deleted (zero-length intervals -> NA Surv).
keep_idx <- stats::complete.cases(mf_all)
drop_idx <- which(!keep_idx)
# i0$datacox[which(!keep_idx), c("id","time","time.lag","event_vis","tr_outcome","tr_outcome.lag","cens_time")]
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Covariate list for the adjusted visit-intensity model below.
model_vars <- c(
"time", "event_vis", # Survival components
"tr_noncompletion",
"biopsych_comp_severe",
"treat_lt_90",
"treat_log_days",
"polysubstance_strict",
"age_c10",
"year_c",
"susinidum_oh",
"susinidum_coc",
"susinidum_pbc",
"susinidum_mar",
"dg_psiq_cie_10_instudy",
"dg_psiq_cie_10_dg",
"prim_sub_daily",
"occ24_inactive",
"occ24_unemployed",
"susprindum_oh",
"susprindum_coc",
"susprindum_pbc",
"susprindum_mar",
"id"
)#age_c10, year_c
cat("Filtered without events with ties")
# Drop every subject that contributed a dropped (incomplete) interval.
sdat_filt<- subset(sdat, !id %in% i0$datacox[which(!keep_idx),"id"])
maxfu_df_filt <- subset(maxfu_df, !maxfu.id %in% i0$datacox[which(!keep_idx),"id"])
cat("Mean positive days for lag treat_log_days at 90 days\n")
# Mean log-days among rows at exactly 90 days in treatment; the value
# (~4.51 = log(90)) seeds `lagfirst` for treat_log_days.lag below.
filter(sdat, dit_rec6==90)|> select(dit_rec6, treat_log_days)|> pull(treat_log_days)|> mean()
# Fully-adjusted visit-intensity model ("lag = 1" variant): lagged clinical
# covariates are initialized to 1 (or log(90) days) at the first visit.
i1_adj <- IrregLong::iiw.weights(
formulaph = survival::Surv(time.lag,time,event_vis)~ tr_noncompletion.lag+ biopsych_comp_severe.lag+ treat_lt_90.lag+ treat_log_days.lag+ polysubstance_strict.lag+ age_c10+ year_c+ susinidum_oh+ susinidum_coc+ susinidum_pbc+ susinidum_mar+ dg_psiq_cie_10_instudy+ dg_psiq_cie_10_dg+ prim_sub_daily+ occ24_inactive+ occ24_unemployed+ susprindum_oh+ susprindum_coc+ susprindum_pbc+ susprindum_mar+ survival::cluster(id),
# If you want stabilized weights, uncomment and provide a null model (often just cluster):
# formulanull = ~ survival::cluster(id),
data = sdat,#[,model_vars],
id = "id",
time = "time",
event = "event_vis", # each row is a visit
lagvars= c("time", "tr_noncompletion", "biopsych_comp_severe", "treat_lt_90", "treat_log_days", "polysubstance_strict"),
invariant = "id",
maxfu = maxfu_df, # per-ID maximum follow-up (same units as 'time': months)
lagfirst = c(5.149111,1,1,1,4.51086,1), # time.lag = 0 at first visit; tr_outcome.lag = 0 at first visit #90/30.5 4.51086 es 90 días
first = TRUE # baseline visit observed with prob = 1
)Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Code
# Call iiw.weights w/0s
# Primary ("lag = 0") variant: binary lags start at 0 at the first visit;
# treat_log_days.lag starts at 2.772589 = log(16) (~15 days, see below).
i1_adj_0 <- IrregLong::iiw.weights(
formulaph = survival::Surv(time.lag, time, event_vis) ~
tr_noncompletion.lag + biopsych_comp_severe.lag + treat_lt_90.lag +
treat_log_days.lag + polysubstance_strict.lag +
age_c10+ year_c+
susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg + prim_sub_daily +
occ24_inactive + occ24_unemployed +
susprindum_oh + susprindum_coc + susprindum_pbc + susprindum_mar +
survival::cluster(id),
data = sdat,
id = "id",
time = "time",
event = "event_vis",
# include *all* variables you want lagged (and in the same order for lagfirst)
lagvars = c("time", "tr_noncompletion", "biopsych_comp_severe",
"treat_lt_90", "treat_log_days", "polysubstance_strict"),
invariant = "id",
maxfu = maxfu_df,
# use zeros to avoid integer/double coercion warnings
lagfirst = c(5.149111/2, 0, 0, 0, 2.772589, 0),
first = TRUE
)Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Code
# Side-by-side six-number summaries of the raw IIW weights from the two
# lag initializations, rendered as a markdown table.
rbind.data.frame(
cbind.data.frame(model= "Primary (after PH, lag=0)", t(matrix(summary(i1_adj_0$iiw.weight))) ),
cbind.data.frame(model= "Alternative (after PH, lag=1)", t(matrix(summary(i1_adj$iiw.weight)))) )%>%
{
knitr::kable(., digits=2, size=10, format="markdown", caption="Weights (no correction for PHs)", col.names= c("Weight",attr(summary(i1_adj$iiw.weight),"names")) )
}
# |Weight                        | Min.| 1st Qu.| Median| Mean| 3rd Qu.| Max.| NA's|
# |:-----------------------------|----:|-------:|------:|----:|-------:|----:|----:|
# |Primary (after PH, lag=0)     | 0.17|    1.00|   1.11| 1.58|    1.97| 9.03|  982|
# |Alternative (after PH, lag=1) | 0.09|    0.45|   0.86| 0.74|    1.00| 2.35|  982|
# |Weight                        | Min.| 1st Qu.| Median| Mean| 3rd Qu.| Max.|
# |:-----------------------------|----:|-------:|------:|----:|-------:|----:|
# |Primary (after PH, lag=0)     | 0.16|    1.00|   1.09| 1.55|    1.92| 8.78|
# |Alternative (after PH, lag=1) | 0.08|    0.47|   0.93| 0.76|    1.00| 2.56|Survival time
Min. 1st Qu. Median Mean 3rd Qu. Max.
7.452 95.484 129.484 125.146 159.806 304.871
Call:
coxph(formula = formulaph, data = datacox)
coef exp(coef) se(coef) z p
tr_outcome.lag -6.883e-01 5.025e-01 8.237e-03 -83.560 <2e-16
survival::cluster(id) 3.000e-07 1.000e+00 4.848e-07 0.619 0.536
Likelihood ratio test=6858 on 2 df, p=< 2.2e-16
n= 95355, number of events= 67912
(45 observations deleted due to missingness)
Filtered without events with tiesMean positive days for lag treat_log_days at 90 days
[1] 4.51086
| Weight | Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|---|
| Primary (after PH, lag=0) | 0.16 | 1.00 | 1.09 | 1.55 | 1.92 | 8.77 |
| Alternative (after PH, lag=1) | 0.08 | 0.47 | 0.93 | 0.76 | 1.00 | 2.56 |
We built patient trajectories in SISTRAT by ordering episodes within each hash_key, labeling sequential treatments, and defining each person’s entry date. Using lubridate::time_length() on interval(enter_date, disch_date) and interval(enter_date, 2025-05-28), we computed follow-up times in months (time, cens_time), then resolved ties within subjects by adding tiny offsets so repeated times don’t collide. We set the survival time (surv_time) to time for events and to cens_time otherwise (event proxied by tr_noncompletion). To examine visit irregularity, we used IrregLong’s diagnostics and summarized the proportions with 0, 1, and ≥2 visits across time bins. Finally, we fitted a visit-intensity (counting-process) model with IrregLong::iiw.weights()—treating each row as a visit, lagging key covariates internally, and supplying per-subject maximum follow-up—to obtain inverse-intensity weights for downstream outcome analyses that account for irregular and potentially informative visit times.
The visit‐intensity Cox model shows that the lagged treatment outcome is strongly associated with when the next visit is observed: conditional on the model, the instantaneous rate of being observed at a visit is about 50% lower following a prior non-completion event (hazard ratio ≈ 0.50). This provides clear evidence of informative visit times, justifying the use of inverse-intensity weights. The overall likelihood ratio test is highly significant (χ² ≈ 6858 on 2 df), with 95,355 intervals and 67,912 visit events (45 rows dropped for missingness).
Weights by plan type
We fixed zero-length intervals in the dat database; those rows contribute no person-time. We then winsorized and renormalized the weight values by restricting values below the 2.5th percentile to that percentile, and values above the 97.5th percentile to the 97.5th percentile.
Code
#Great—those 45 rows are exactly the ones with zero-length intervals: time == 0 and time.lag == 0 (baseline rows with event_vis == 1). In a counting-process Surv(start, stop, event), stop must be strictly greater than start, so these get dropped.
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Fix: precompute a baseline-safe lag for the covariate yourself and don’t ask IrregLong to re-lag it. Only let IrregLong lag time. This avoids baseline NAs and keeps rows.
## Start from i0$datacox
dat <- data.table::as.data.table(sdat) # ensure data.table
## Sort
data.table::setorder(dat, id, time)
## Make intervals strictly positive & ordered
## Break ties at identical (id, time) by adding multiples of eps, then
## force each interval's stop to exceed its start.
eps <- 1e-3
dat[, .idx := seq_len(.N) - 1L, by = .(id, time)]
dat[, time := time + eps * .idx][, .idx := NULL]
dat[, time.lag := data.table::shift(time, type = "lag", fill = 0), by = .(id)]
dat[time <= time.lag, time := time.lag + eps]
## Lagged covariate with baseline = 0 (no NAs on first row)
dat[, tr_outcome_lag0 := data.table::shift(tr_outcome, type = "lag"), by = .(id)]
dat[is.na(tr_outcome_lag0), tr_outcome_lag0 := 0L]
## Per-id max follow-up (keep id types consistent)
dat[, id := as.character(id)]
maxfu_df <- dat[, .(maxfu.time = max(cens_time, na.rm = TRUE)),
by = .(maxfu.id = id)]
## NOTE(review): this repeats the as.character(id) conversion two lines up;
## it is redundant (harmless) and could be removed.
dat[, id := as.character(id)]
data.table::setDF(dat); maxfu_df <- as.data.frame(maxfu_df)
## Refit: only let IrregLong lag time; use our tr_outcome_lag0 in the formula
i0_3 <- IrregLong::iiw.weights(
formulaph = survival::Surv(time.lag, time, event_vis) ~ tr_outcome_lag0 + survival::cluster(id),
data = dat,
id = "id",
time = "time",
event = "event_vis",
lagvars = c("time"), # we already created tr_outcome_lag0
invariant = "id",
maxfu = maxfu_df,
lagfirst = c(0), # first time.lag = 0
first = TRUE
)Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Code
## Sanity check: how many rows would coxph drop now?
mf3 <- stats::model.frame(i0_3$m, data = i0_3$datacox, na.action = stats::na.pass)Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
# Count incomplete rows and NA weights after the interval fix.
sum(!stats::complete.cases(mf3)) # should be ~0 (or just the true oddballs), not ~31928
table(is.na(i0_3$iiw.weight))
invisible("didnt change, despite no more NAs in weights")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Mean positive days for lag treat_log_days at 15 days\n")
# Mean log-days at 15 days in treatment (~2.77 = log(16)); used as the
# lagfirst seed for treat_log_days.lag in the plan-stratified fits.
filter(sdat, dit_rec6==15)|> select(dit_rec6, treat_log_days)|> pull(treat_log_days)|> mean()
cat("$:$:$:$:$:$:$:$:$:$:$:$:$:$:\n")
cat("Weights by plan type\n")
cat("$:$:$:$:$:$:$:$:$:$:$:$:$:$:\n")
# Fit one visit-intensity model per treatment plan type ("lag = 0" seeds).
plan_types <- unique(dat$plan_type_mod)
iiw_by_plan0 <- lapply(plan_types, function(pt) {
cat("Running for:", pt, "\n")
# Subset as plain data.frame
dat_sub <- dat |> filter(plan_type_mod == pt) |> as.data.frame()
# Subset maxfu_df to only the ids in dat_sub
maxfu_sub <- maxfu_df[maxfu_df$maxfu.id %in% dat_sub$id, , drop = FALSE]
IrregLong::iiw.weights(
formulaph = survival::Surv(time.lag, time, event_vis) ~
tr_noncompletion.lag + biopsych_comp_severe.lag +
treat_lt_90.lag + treat_log_days.lag + polysubstance_strict.lag +
age_c10 + year_c +
susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg + prim_sub_daily +
occ24_inactive + occ24_unemployed +
susprindum_oh + susprindum_coc + susprindum_pbc + susprindum_mar +
survival::cluster(id),
data = dat_sub,
id = "id",
time = "time",
event = "event_vis",
lagvars = c("time", "tr_noncompletion", "biopsych_comp_severe",
"treat_lt_90", "treat_log_days", "polysubstance_strict"),
invariant = "id",
maxfu = maxfu_sub, # stratified by subset IDs
lagfirst = c(5.149111, 0, 0, 0, 2.772589, 0),
first = TRUE
)
})Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Code
names(iiw_by_plan0) <- plan_types
# Same plan-stratified fits with the alternative "lag = 1" seeds
# (binary lags start at 1; treat_log_days.lag starts at log(90) = 4.51086).
iiw_by_plan1 <- lapply(plan_types, function(pt) {
cat("Running for:", pt, "\n")
# Subset as plain data.frame
dat_sub <- dat |> filter(plan_type_mod == pt) |> as.data.frame()
# Subset maxfu_df to only the ids in dat_sub
maxfu_sub <- maxfu_df[maxfu_df$maxfu.id %in% dat_sub$id, , drop = FALSE]
IrregLong::iiw.weights(
formulaph = survival::Surv(time.lag, time, event_vis) ~
tr_noncompletion.lag + biopsych_comp_severe.lag +
treat_lt_90.lag + treat_log_days.lag + polysubstance_strict.lag +
age_c10 + year_c +
susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg + prim_sub_daily +
occ24_inactive + occ24_unemployed +
susprindum_oh + susprindum_coc + susprindum_pbc + susprindum_mar +
survival::cluster(id),
data = dat_sub,
id = "id",
time = "time",
event = "event_vis",
lagvars = c("time", "tr_noncompletion", "biopsych_comp_severe",
"treat_lt_90", "treat_log_days", "polysubstance_strict"),
invariant = "id",
maxfu = maxfu_sub, # stratified by subset IDs
lagfirst = c(5.149111, 1, 1, 1, 4.51086, 1),
first = TRUE
)
})Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Warning in survival::Surv(time.lag, time, event_vis): Stop time must be > start time, NA created
Code
# Name the second list of fits by plan type, mirroring iiw_by_plan0.
names(iiw_by_plan1) <- plan_types
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Extract weights\n")
extract_weights <- function(iiw_by_plan, dat,
                            weight_name = "iiw.weight",
                            out_col = "iiw_weight",
                            na_to_one = TRUE) {
  # Attach per-plan IIW weights back onto the rows of `dat`.
  # For each named fit in `iiw_by_plan`, take the rows of `dat` with the
  # matching plan_type_mod, validate that the fit carries a weight vector of
  # the same length, optionally recode NA weights to 1, store the weights in
  # `out_col`, and stack all plan subsets into a single data.frame.
  plan_names <- names(iiw_by_plan)
  pieces <- vector("list", length(plan_names))
  for (k in seq_along(plan_names)) {
    pt <- plan_names[k]
    fit <- iiw_by_plan[[pt]]
    rows <- dat[dat$plan_type_mod == pt, , drop = FALSE]
    # Guard: the fit object must expose the requested weight component.
    if (!weight_name %in% names(fit)) {
      stop(sprintf("Weight '%s' not found in model for plan type '%s'",
                   weight_name, pt))
    }
    wts <- fit[[weight_name]]
    # Guard: one weight per data row, in matching order.
    if (length(wts) != nrow(rows)) {
      stop(sprintf("Row mismatch for %s: %d weights vs %d rows",
                   pt, length(wts), nrow(rows)))
    }
    if (na_to_one) {
      wts[is.na(wts)] <- 1
    }
    rows[[out_col]] <- wts
    pieces[[k]] <- rows
  }
  do.call(rbind, pieces)
}
# Pull both weight sets back onto the data (one column per lag scheme).
dat_with_weights_stab <- extract_weights(iiw_by_plan0, dat,
weight_name = "iiw.weight",
out_col = "iiw_weight0_stab")
dat_with_weights_stab1 <- extract_weights(iiw_by_plan1, dat,
weight_name = "iiw.weight",
out_col = "iiw_weight1_stab")
# Merge the lag=1 weights into the lag=0 table by row key.
# NOTE(review): the "rn" join key is not created in this chunk — presumably a
# rowname/row-id column added upstream; verify it exists and is unique.
dat_with_weights_stab_cons<-
dat_with_weights_stab|> #67957
left_join(dat_with_weights_stab1[,c("rn","iiw_weight1_stab")], by="rn")|>
arrange(hash_key, adm_age_rec3)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Add weight 1 in case of first treatment for time.lag==0\n")
table(subset(dat_with_weights_stab_cons, treatment==1)$time.lag)
# Baseline rows (time.lag == 0) are observed with probability 1, so any
# missing weight there is set to 1 for both schemes.
dat_with_weights_stab_cons$iiw_weight0_stab <- ifelse(
is.na(dat_with_weights_stab_cons$iiw_weight0_stab) & dat_with_weights_stab_cons$time.lag == 0,
1,
dat_with_weights_stab_cons$iiw_weight0_stab
)
dat_with_weights_stab_cons$iiw_weight1_stab <- ifelse(
is.na(dat_with_weights_stab_cons$iiw_weight1_stab) & dat_with_weights_stab_cons$time.lag == 0,
1,
dat_with_weights_stab_cons$iiw_weight1_stab
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Six-number summaries of the merged weights, rendered as markdown.
rbind.data.frame(
cbind.data.frame(model= "Primary (after PH, lag=0)",
t(matrix(summary(dat_with_weights_stab_cons$iiw_weight0_stab ))) ),
cbind.data.frame(model= "Alternative (after PH, lag=1)", t(matrix(summary(dat_with_weights_stab_cons$iiw_weight1_stab)))) )%>%
{
knitr::kable(., digits=2, size=10, format="markdown", caption="Weights (no correction for PHs)", col.names= c("Weight",attr(summary(dat_with_weights_stab_cons$iiw_weight1_stab),"names")) )
}
# Winsorize at the 2.5th/97.5th percentiles, then renormalize to mean 1.
# NOTE(review): cap(), renorm(), and summ_w() are helpers defined elsewhere
# in this document — confirm they are in scope before this chunk runs.
w_primary_tr <- cap(dat_with_weights_stab_cons$iiw_weight0_stab, c(0.025, 0.975))
w_alt_tr <- cap(dat_with_weights_stab_cons$iiw_weight1_stab, c(0.025, 0.975))
w0 <- dat_with_weights_stab_cons$iiw_weight0_stab
w0_tr <- renorm(cap(w0, c(0.025, 0.975)))
w1 <- dat_with_weights_stab_cons$iiw_weight1_stab
w1_tr <- renorm(cap(w1, c(0.025, 0.975)))
cbind.data.frame(type= c("Lag=0, raw","lag=0 trunc 2.5–97.5%","lag=1 raw","lag=1 trunc 2.5–97.5%"),
dplyr::bind_rows(
summ_w(w0, "lag=0 raw"),
summ_w(w0_tr, "lag=0 trunc 2.5–97.5%"),
summ_w(w1, "lag=1 raw"),
summ_w(w1_tr, "lag=1 trunc 2.5–97.5%")
))|> knitr::kable("markdown", digits=2, caption= "Weights, stabilized and truncated")
# |type                  | n_non_na| n_na|  min|  q01|  q25| median| mean|  q75|  q99|  max| prop_lt_0.2| prop_gt_5|      ESS|
# |:---------------------|--------:|----:|----:|----:|----:|------:|----:|----:|----:|----:|-----------:|---------:|--------:|
# |Lag=0, raw            |    67957|    0| 0.19| 0.36| 1.00|   1.00| 1.06| 1.00| 2.67| 7.05|           0|         0| 59450.19|
# |lag=0 trunc 2.5–97.5% |    67957|    0| 0.42| 0.42| 0.95|   0.95| 1.00| 0.95| 2.11| 2.11|           0|         0| 61803.67|
# |lag=1 raw             |    67957|    0| 0.15| 0.35| 1.00|   1.00| 1.07| 1.00| 2.82| 8.25|           0|         0| 58250.96|
# |lag=1 trunc 2.5–97.5% |    67957|    0| 0.41| 0.41| 0.95|   0.95| 1.00| 0.95| 2.14| 2.14|           0|         0| 61572.35|
# INTEGRATE!
# Store truncated, renormalized weights on the analysis data frame.
# (Trailing "[1] 27444" below is console residue from rendering.)
dat_with_weights_stab_cons$w0_tr <- w0_tr
dat_with_weights_stab_cons$w1_tr <- w1_tr[1] 27444
FALSE
67957
Mean positive days for lag treat_log_days at 15 days
[1] 2.772589
$:$:$:$:$:$:$:$:$:$:$:$:$:$:
Weights by plan type
$:$:$:$:$:$:$:$:$:$:$:$:$:$:
Running for: GP basic ambulatory
Running for: GP residential
Running for: WO residential
Running for: GP intensive ambulatory
Running for: WO intensive ambulatory
Running for: GP basic ambulatory
Running for: GP residential
Running for: WO residential
Running for: GP intensive ambulatory
Running for: WO intensive ambulatory
Extract weights
Add weight 1 in case of first treatment for time.lag==0
0
27445
| Weight | Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|---|
| Primary (after PH, lag=0) | 0.15 | 1.00 | 1.02 | 1.46 | 1.78 | 7.35 |
| Alternative (after PH, lag=1) | 0.08 | 0.51 | 0.93 | 0.78 | 1.00 | 3.12 |
| type | n_non_na | n_na | min | q01 | q25 | median | mean | q75 | q99 | max | prop_lt_0.2 | prop_gt_5 | ESS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Lag=0, raw | 67957 | 0 | 0.15 | 0.59 | 1.00 | 1.02 | 1.46 | 1.78 | 3.93 | 7.35 | 0.00 | 0 | 54514.18 |
| lag=0 trunc 2.5–97.5% | 67957 | 0 | 0.50 | 0.50 | 0.69 | 0.71 | 1.00 | 1.23 | 2.33 | 2.33 | 0.00 | 0 | 56078.17 |
| lag=1 raw | 67957 | 0 | 0.08 | 0.19 | 0.51 | 0.93 | 0.78 | 1.00 | 1.46 | 3.12 | 0.01 | 0 | 59071.99 |
| lag=1 trunc 2.5–97.5% | 67957 | 0 | 0.29 | 0.29 | 0.66 | 1.21 | 1.00 | 1.29 | 1.64 | 1.64 | 0.00 | 0 | 59698.29 |
Intermediate saving
Save the current R environment at that intermediate step.
Code
#save.image(file = file.path(wdpath, "data/20241015_out/psu", "intermediate.RData"))Association between non-completion and PSU, general
Code
gc()
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
#load(file = file.path(wdpath, "data/20241015_out/psu", "intermediate.RData"))
# Fit the outcome model (modified-Poisson for risk ratios, cluster-robust
# SEs by subject) unweighted and under both IIW weight schemes, then write
# exponentiated estimates/CIs to CSV. The interactive branch runs inside a
# background job (job::job) to keep the session responsive; the non-
# interactive branch runs the identical code inline.
# NOTE(review): the two branches duplicate the same code verbatim; any edit
# must be applied to both to keep them in sync.
if (interactive()) {
job::job({
gc()
suppressPackageStartupMessages({
library(dplyr)
library(sandwich)
})
## -------------------- MODELO --------------------
ff_formula <- tr_noncompletion ~ polysubstance_strict + age_c10 + year_c +
susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
prim_sub_daily + occ24_inactive + occ24_unemployed +
susprindum_coc + susprindum_pbc + susprindum_mar
vars_ff <- all.vars(ff_formula)
vars_extra <- c("id", "w0_tr", "w1_tr")
vars_need <- unique(c(vars_ff, vars_extra))
# No data.table
base <- as.data.frame(dat_with_weights_stab_cons)[, vars_need, drop = FALSE]
# Model frame without NAs to construct 'keep' mask
mf_all <- stats::model.frame(ff_formula, data = base, na.action = NULL)
keep <- stats::complete.cases(mf_all)
if (!any(keep)) stop("No hay casos completos para la fórmula del modelo.")
mf <- mf_all[keep, , drop = FALSE] # datos alineados con la fórmula
id_vec <- base$id[keep]
if (anyNA(id_vec)) stop("La variable 'id' contiene NA tras el filtrado.")
id_int <- as.integer(factor(id_vec, exclude = NULL)) # clúster compacto y estable
## Clean weights (positive, finite, mean=1), aligned to 'keep'
clean_weights <- function(w) {
w[!is.finite(w) | is.na(w) | w <= 0] <- 1
w / mean(w, na.rm = TRUE)
}
w0 <- clean_weights(base$w0_tr[keep])
w1 <- clean_weights(base$w1_tr[keep])
stopifnot(is.numeric(w0), is.numeric(w1),
length(w0) == nrow(mf), length(w1) == nrow(mf))
ncl <- length(unique(id_int))
## -------------------- ROBUST --------------------
## --- Fit with cluster-robust SEs; embed weights into data to avoid scoping bugs ---
fit_glm_robust <- function(weights_vec = NULL) {
df <- mf
if (is.null(weights_vec)) {
m <- stats::glm(ff_formula, data = df, family = stats::poisson())
} else {
df$.__w <- weights_vec # <- embed weights here
m <- stats::glm(ff_formula, data = df, family = stats::poisson(), weights = .__w)
}
vc <- if (ncl >= 2) sandwich::vcovCL(m, cluster = id_int) else sandwich::vcovHC(m, type = "HC0")
list(model = m, vcov = vc)
}
m_un <- fit_glm_robust(NULL)
m_w0 <- fit_glm_robust(w0)
m_w1 <- fit_glm_robust(w1)
## -------------------- RESULTS --------------------
extract_results <- function(obj, name) {
co <- stats::coef(obj$model)
se <- sqrt(diag(obj$vcov))
z <- stats::qnorm(0.975)
out <- data.frame(
term = names(co),
est = exp(co),
cilo = exp(co - z * se),
ciup = exp(co + z * se),
p.value = 2 * stats::pnorm(abs(co / se), lower.tail = FALSE),
check.names = FALSE
)
names(out)[-1] <- paste0(names(out)[-1], "_", name)
out
}
est_summary <- Reduce(
function(x, y) dplyr::full_join(x, y, by = "term"),
list(
extract_results(m_un, "un"),
extract_results(m_w0, "w0"),
extract_results(m_w1, "w1")
)
)
utils::write.csv(est_summary, file.path(wdpath, "data/20241015_out/psu", "simple_results.csv"), row.names = FALSE)
cat("[ok] Saved in simple_results.csv (", nrow(est_summary), " rows)\n")
print(utils::head(est_summary), row.names = FALSE)
})
} else {
gc()
suppressPackageStartupMessages({
library(dplyr)
library(sandwich)
})
## -------------------- MODELO --------------------
ff_formula <- tr_noncompletion ~ polysubstance_strict + age_c10 + year_c +
susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
prim_sub_daily + occ24_inactive + occ24_unemployed +
susprindum_coc + susprindum_pbc + susprindum_mar
vars_ff <- all.vars(ff_formula)
vars_extra <- c("id", "w0_tr", "w1_tr")
vars_need <- unique(c(vars_ff, vars_extra))
# No data.table
base <- as.data.frame(dat_with_weights_stab_cons)[, vars_need, drop = FALSE]
# Model frame without NAs to construct 'keep' mask
mf_all <- stats::model.frame(ff_formula, data = base, na.action = NULL)
keep <- stats::complete.cases(mf_all)
if (!any(keep)) stop("No hay casos completos para la fórmula del modelo.")
mf <- mf_all[keep, , drop = FALSE] # datos alineados con la fórmula
id_vec <- base$id[keep]
if (anyNA(id_vec)) stop("La variable 'id' contiene NA tras el filtrado.")
id_int <- as.integer(factor(id_vec, exclude = NULL)) # clúster compacto y estable
## Clean weights (positive, finite, mean=1), aligned to 'keep'
clean_weights <- function(w) {
w[!is.finite(w) | is.na(w) | w <= 0] <- 1
w / mean(w, na.rm = TRUE)
}
w0 <- clean_weights(base$w0_tr[keep])
w1 <- clean_weights(base$w1_tr[keep])
stopifnot(is.numeric(w0), is.numeric(w1),
length(w0) == nrow(mf), length(w1) == nrow(mf))
ncl <- length(unique(id_int))
## -------------------- ROBUST --------------------
## --- Fit with cluster-robust SEs; embed weights into data to avoid scoping bugs ---
fit_glm_robust <- function(weights_vec = NULL) {
df <- mf
if (is.null(weights_vec)) {
m <- stats::glm(ff_formula, data = df, family = stats::poisson())
} else {
df$.__w <- weights_vec # <- embed weights here
m <- stats::glm(ff_formula, data = df, family = stats::poisson(), weights = .__w)
}
vc <- if (ncl >= 2) sandwich::vcovCL(m, cluster = id_int) else sandwich::vcovHC(m, type = "HC0")
list(model = m, vcov = vc)
}
m_un <- fit_glm_robust(NULL)
m_w0 <- fit_glm_robust(w0)
m_w1 <- fit_glm_robust(w1)
## -------------------- RESULTS --------------------
extract_results <- function(obj, name) {
co <- stats::coef(obj$model)
se <- sqrt(diag(obj$vcov))
z <- stats::qnorm(0.975)
out <- data.frame(
term = names(co),
est = exp(co),
cilo = exp(co - z * se),
ciup = exp(co + z * se),
p.value = 2 * stats::pnorm(abs(co / se), lower.tail = FALSE),
check.names = FALSE
)
names(out)[-1] <- paste0(names(out)[-1], "_", name)
out
}
est_summary <- Reduce(
function(x, y) dplyr::full_join(x, y, by = "term"),
list(
extract_results(m_un, "un"),
extract_results(m_w0, "w0"),
extract_results(m_w1, "w1")
)
)
utils::write.csv(est_summary, file.path(wdpath, "data/20241015_out/psu", "simple_results.csv"), row.names = FALSE)
cat("[ok] Saved in simple_results.csv (", nrow(est_summary), " rows)\n")
print(utils::head(est_summary), row.names = FALSE)
}
gc()
# Rename CI/p-value columns to short names and show only the
# polysubstance row(s) as a markdown table.
# (Trailing "used (Mb) ..." text below is gc() console residue.)
est_summary |>
dplyr::rename(
var = term,
lo_un = cilo_un,
hi_un = ciup_un,
p_un = p.value_un,
lo_w0 = cilo_w0,
hi_w0 = ciup_w0,
p_w0 = p.value_w0,
lo_w1 = cilo_w1,
hi_w1 = ciup_w1,
p_w1 = p.value_w1
)|>
dplyr::filter(grepl("poly",var))|>
knitr::kable("markdown", caption="Summary of associations by weights, Generalized model", digits=2) used (Mb) gc trigger (Mb) max used (Mb)
Ncells 4502685 240.5 7763474 414.7 7763474 414.7
Vcells 337759966 2577.0 551857418 4210.4 459814485 3508.2
[ok] Saved in simple_results.csv ( 16 rows)
term est_un cilo_un ciup_un p.value_un est_w0
(Intercept) 0.5633872 0.5181064 0.6126254 4.479191e-41 0.5442844
polysubstance_strict 1.0430549 1.0309425 1.0553096 1.511658e-12 1.0511400
age_c10 0.8997996 0.8891297 0.9105975 2.055947e-67 0.8997624
year_c 0.9962844 0.9951192 0.9974509 4.517183e-10 0.9961477
susinidum_oh 1.0276423 0.9563340 1.1042677 4.574001e-01 1.0505182
susinidum_coc 1.0572570 0.9824007 1.1378172 1.372652e-01 1.0787129
cilo_w0 ciup_w0 p.value_w0 est_w1 cilo_w1 ciup_w1 p.value_w1
0.4963808 0.5968110 2.649867e-38 0.5608757 0.5141676 0.6118269 7.775737e-39
1.0376779 1.0647768 3.356649e-14 1.0457489 1.0327894 1.0588710 2.051321e-12
0.8881618 0.9115144 2.706333e-57 0.8968725 0.8856240 0.9082638 4.332718e-64
0.9948846 0.9974125 2.491781e-09 0.9960187 0.9947877 0.9972513 2.578989e-10
0.9703832 1.1372708 2.234709e-01 1.0225419 0.9495085 1.1011927 5.554597e-01
0.9948215 1.1696787 6.661339e-02 1.0500175 0.9734672 1.1325874 2.063375e-01
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 4529370 241.9 7763474 414.7 7763474 414.7
Vcells 346112712 2640.7 551857418 4210.4 511340968 3901.3
| var | est_un | lo_un | hi_un | p_un | est_w0 | lo_w0 | hi_w0 | p_w0 | est_w1 | lo_w1 | hi_w1 | p_w1 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| polysubstance_strict | 1.04 | 1.03 | 1.06 | 0 | 1.05 | 1.04 | 1.06 | 0 | 1.05 | 1.03 | 1.06 | 0 |
Descriptives, again
Code
cat("Survival time\n")
summary(dat$cens_time)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#   6.7    90.2   124.9   121.1   157.4   304.9
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#   7.452  95.484 129.484 125.146 159.806 304.871
cat("Discard patients with only 1 treatment\n")
# Pick the strata variable automatically
strata_var <- if ("polysubstance_strict" %in% names(dat_with_weights_stab_cons)) "polysubstance_strict" else "polysubstance"
# Variables of interest (as you listed)
vars_interest2 <- c(
"tr_noncompletion",
"biopsych_comp_severe", # Biopsychosocial compromise
"treat_lt_90",
"treat_log_days",
"adm_age_rec3", # Age at Admission (First Entry)
"year_decimal", # Birth year (assumed numeric or year-like)
"susinidum_oh", # Primary substance of the initial diagnosis
"susinidum_coc",
"susinidum_pbc",
"susinidum_mar",
"dg_psiq_cie_10_dg", # Psychiatric comorbidity (ICD-10)
"dg_psiq_cie_10_instudy", # Psychiatric comorbidity (ICD-10) [In study]
#"cie10_broad_cat", # Broad diagnostic categories
#"prim_sub_freq", # Frequency of primary substance use at admission
"prim_sub_daily", # Daily (0/1 you created)
"occ24_inactive", # Occupational status (corrected)
"occ24_unemployed",
"primary_sub_mod", # Primary substance at admission to treatment
"susprindum_oh",
"susprindum_coc",
"susprindum_pbc",
"susprindum_mar"
)
# Which of those should be treated as categorical in TableOne
factorVars_interest2 <- c(
"biopsych_comp_severe",
"susinidum_oh",
"susinidum_coc",
"susinidum_pbc",
"susinidum_mar",
"dg_psiq_cie_10_dg",
"dg_psiq_cie_10_instudy",
#"cie10_broad_cat",
#"prim_sub_freq",
"prim_sub_daily", # treat 0/1 as categorical to show counts/%
"occ24_inactive",
"occ24_unemployed",
"susprindum_oh",
"susprindum_coc",
"susprindum_pbc",
"susprindum_mar",
"tr_noncompletion",
"treat_lt_90"
)
# 2) Labels (these will be picked up by packages like table1/gtsummary; TableOne itself doesn’t use labels)
# Attach human-readable "label" attributes to the analysis columns so that
# downstream table packages print descriptive row names.
attr(dat_with_weights_stab_cons$tr_noncompletion, "label") <- "Non-completion status of treatment (Dropout / Misspelled)"
attr(dat_with_weights_stab_cons$biopsych_comp_severe, "label") <- "Biopsychosocial compromise (Severe)"
attr(dat_with_weights_stab_cons$treat_lt_90, "label") <- "Treatment duration (binary) (<90 days)"
attr(dat_with_weights_stab_cons$treat_log_days, "label") <- "Treatment duration (log-scaled days)"
attr(dat_with_weights_stab_cons$adm_age_rec3, "label") <- "Age at admission to treatment"
attr(dat_with_weights_stab_cons$year_decimal, "label") <- "Birth year"
# Primary substance (initial diagnosis) – label the variable; levels print as rows
attr(dat_with_weights_stab_cons$susinidum_oh, "label") <- "Primary substance (initial diagnosis), alcohol"
attr(dat_with_weights_stab_cons$susinidum_coc, "label") <- "Primary substance (initial diagnosis), cocaine hydrochloride"
attr(dat_with_weights_stab_cons$susinidum_pbc, "label") <- "Primary substance (initial diagnosis), cocaine base paste"
# FIX: this label belongs to susinidum_mar (initial diagnosis), not
# susprindum_mar, which is labeled in the admission block below. Previously
# susinidum_mar was left unlabeled.
attr(dat_with_weights_stab_cons$susinidum_mar, "label") <- "Primary substance (initial diagnosis), marijuana"
# Psychiatric comorbidity
attr(dat_with_weights_stab_cons$dg_psiq_cie_10_instudy, "label") <- "Psychiatric comorbidity (ICD-10): In study"
attr(dat_with_weights_stab_cons$dg_psiq_cie_10_dg, "label") <- "Psychiatric comorbidity (ICD-10): Diagnosis"
# Frequency / Daily
attr(dat_with_weights_stab_cons$prim_sub_daily, "label") <- "Daily frequency of primary substance used at admission"
# Occupational status (levels like “Inactive”, “Unemployed” will print under this)
attr(dat_with_weights_stab_cons$occupation_condition_corr24,"label") <- "Occupational Status"
# Primary substance at admission – label the variable; levels print as rows
attr(dat_with_weights_stab_cons$susprindum_oh, "label") <- "Primary substance at admission to treatment, alcohol"
attr(dat_with_weights_stab_cons$susprindum_coc, "label") <- "Primary substance at admission to treatment, cocaine hydrochloride"
attr(dat_with_weights_stab_cons$susprindum_pbc, "label") <- "Primary substance at admission to treatment, cocaine base paste"
attr(dat_with_weights_stab_cons$susprindum_mar, "label") <- "Primary substance at admission to treatment, marijuana"
# Baseline table: first row per subject (hash_key), stratified by the
# polysubstance indicator, with SMDs and an Overall column.
tbone_interest2 <- tableone::CreateTableOne(
vars = vars_interest2,
data = tidytable::slice_head(dat_with_weights_stab_cons,n = 1, .by = hash_key)|> select(all_of(c(vars_interest2, "polysubstance_strict"))),
factorVars = factorVars_interest2,
strata = strata_var,
addOverall = TRUE,
includeNA = TRUE,
smd = TRUE,
test = TRUE
)
# Make a composite key characteristic+level for ordering
# Skewed continuous variables are summarized as median [IQR].
df_tbone2 <- as.data.frame.TableOne(
tbone_interest2,
nonnormal = c("adm_age_rec3", "year_decimal", "treat_log_days"),
smd = TRUE
)
# Build desired order as a tibble with both characteristic and level
order_df <- tibble::tribble(
~characteristic, ~level,
"n", "",
"Non-completion status of treatment (Dropout / Misspelled) (%)", "0",
"Non-completion status of treatment (Dropout / Misspelled) (%)", "1",
"Biopsychosocial compromise (Severe) (%)", "0",
"Biopsychosocial compromise (Severe) (%)", "1",
"Biopsychosocial compromise (Severe) (%)", NA,
"Treatment duration (binary) (<90 days) (%)", "0",
"Treatment duration (binary) (<90 days) (%)", "1",
"Treatment duration (log-scaled days) (median [IQR])", "",
"Age at admission to treatment (median [IQR])", "",
"Birth year (median [IQR])", "",
"Primary substance (initial diagnosis) (%)", "cocaine powder",
"Primary substance (initial diagnosis) (%)", "cocaine paste",
"Primary substance (initial diagnosis) (%)", "marijuana",
"Primary substance (initial diagnosis) (%)", "alcohol",
"Primary substance (initial diagnosis) (%)", "others",
"Primary substance (initial diagnosis) (%)", NA,
"Psychiatric comorbidity (ICD-10): In study (%)", "TRUE",
"Psychiatric comorbidity (ICD-10): In study (%)", "FALSE",
"Psychiatric comorbidity (ICD-10): Diagnosis (%)", "TRUE",
"Daily frequency of primary substance used at admission (%)", "0",
"Daily frequency of primary substance used at admission (%)", "1",
"Daily frequency of primary substance used at admission (%)", NA,
"Occupational Status (%)", "inactive",
"Occupational Status (%)", "unemployed",
"Occupational Status (%)", "employed",
"Primary substance at admission to treatment (%)", "cocaine powder",
"Primary substance at admission to treatment (%)", "cocaine paste",
"Primary substance at admission to treatment (%)", "marijuana",
"Primary substance at admission to treatment (%)", "alcohol",
"Primary substance at admission to treatment (%)", "others"
)
# Join to assign sort order
df_tbone_ordered2 <- df_tbone2 %>%
left_join(order_df %>% mutate(order_id = dplyr::row_number()),
by = c("characteristic", "level")) %>%
arrange(order_id) %>%
select(-order_id)
# Final formatting: relabel "powder" -> "hydrochloride", trim spaces after
# "(", bold the header row, and render as markdown.
# NOTE(review): format_cells() is a helper defined elsewhere in this
# document — confirm it is in scope before this chunk runs.
df_tbone_ordered2|>
#filter(level!=0, level!=FALSE)|>
mutate(level=gsub("powder", "hydrochloride", level))|>
mutate(
Overall = gsub("\\(\\s+", "(", Overall),
`0` = gsub("\\(\\s+", "(", `0`),
`1` = gsub("\\(\\s+", "(", `1`)
)|>
select(characteristic, level, `0`, `1`, Overall, SMD)|>
tibble::as_tibble()|>
(\(df) {
format_cells(df,1, 1:length(names(df)), "bold")
})() |>
knitr::kable("markdown")Survival time
Min. 1st Qu. Median Mean 3rd Qu. Max.
7.452 95.484 129.484 125.146 159.806 304.871
Discard patients with only 1 treatment
| characteristic | level | 0 | 1 | Overall | SMD |
|---|---|---|---|---|---|
| n | 5956 | 21489 | 27445 | ||
| Non-completion status of treatment (Dropout / Misspelled) (%) | 0 | 1633 (27.4) | 4580 (21.3) | 6213 (22.6) | 0.143 |
| Non-completion status of treatment (Dropout / Misspelled) (%) | 1 | 4323 (72.6) | 16909 (78.7) | 21232 (77.4) | |
| Biopsychosocial compromise (Severe) (%) | 0 | 4417 (74.2) | 13094 (60.9) | 17511 (63.8) | 0.285 |
| Biopsychosocial compromise (Severe) (%) | 1 | 1539 (25.8) | 8395 (39.1) | 9934 (36.2) | |
| Treatment duration (binary) (<90 days) (%) | 0 | 4760 (79.9) | 16160 (75.2) | 20920 (76.2) | 0.113 |
| Treatment duration (binary) (<90 days) (%) | 1 | 1196 (20.1) | 5329 (24.8) | 6525 (23.8) | |
| Treatment duration (log-scaled days) (median [IQR]) | 5.25 [4.65, 5.80] | 5.13 [4.51, 5.70] | 5.16 [4.53, 5.72] | 0.146 | |
| Age at admission to treatment (median [IQR]) | 38.56 [30.44, 47.58] | 31.66 [26.12, 38.65] | 32.82 [26.76, 40.69] | 0.625 | |
| Birth year (median [IQR]) | 1977.48 [1968.61, 1985.57] | 1982.91 [1975.73, 1988.85] | 1982.07 [1974.10, 1988.34] | 0.492 | |
| Psychiatric comorbidity (ICD-10): In study (%) | TRUE | 862 (14.5) | 3903 (18.2) | 4765 (17.4) | |
| Psychiatric comorbidity (ICD-10): In study (%) | FALSE | 5094 (85.5) | 17586 (81.8) | 22680 (82.6) | 0.100 |
| Psychiatric comorbidity (ICD-10): Diagnosis (%) | TRUE | 2638 (44.3) | 10148 (47.2) | 12786 (46.6) | |
| Daily frequency of primary substance used at admission (%) | 0 | 3384 (56.8) | 11078 (51.6) | 14462 (52.7) | 0.106 |
| Daily frequency of primary substance used at admission (%) | 1 | 2572 (43.2) | 10411 (48.4) | 12983 (47.3) | |
| Primary substance (initial diagnosis), alcohol (%) | 0 | 1876 (31.5) | 7934 (36.9) | 9810 (35.7) | 0.115 |
| Primary substance (initial diagnosis), alcohol (%) | 1 | 4080 (68.5) | 13555 (63.1) | 17635 (64.3) | |
| Primary substance (initial diagnosis), cocaine hydrochloride (%) | 0 | 5463 (91.7) | 19805 (92.2) | 25268 (92.1) | 0.016 |
| Primary substance (initial diagnosis), cocaine hydrochloride (%) | 1 | 493 (8.3) | 1684 (7.8) | 2177 (7.9) | |
| Primary substance (initial diagnosis), cocaine base paste (%) | 0 | 5160 (86.6) | 19174 (89.2) | 24334 (88.7) | 0.080 |
| Primary substance (initial diagnosis), cocaine base paste (%) | 1 | 796 (13.4) | 2315 (10.8) | 3111 (11.3) | |
| susinidum_mar (%) | 0 | 5413 (90.9) | 17635 (82.1) | 23048 (84.0) | 0.260 |
| susinidum_mar (%) | 1 | 543 (9.1) | 3854 (17.9) | 4397 (16.0) | |
| Psychiatric comorbidity (ICD-10): Diagnosis (%) | FALSE | 3318 (55.7) | 11341 (52.8) | 14659 (53.4) | 0.059 |
| occ24_inactive (%) | 0 | 4716 (79.2) | 17275 (80.4) | 21991 (80.1) | 0.030 |
| occ24_inactive (%) | 1 | 1240 (20.8) | 4214 (19.6) | 5454 (19.9) | |
| occ24_unemployed (%) | 0 | 4198 (70.5) | 13226 (61.5) | 17424 (63.5) | 0.190 |
| occ24_unemployed (%) | 1 | 1758 (29.5) | 8263 (38.5) | 10021 (36.5) | |
| primary_sub_mod (%) | cocaine paste | 1720 (28.9) | 10507 (48.9) | 12227 (44.6) | 0.710 |
| primary_sub_mod (%) | cocaine hydrochloride | 810 (13.6) | 4789 (22.3) | 5599 (20.4) | |
| primary_sub_mod (%) | alcohol | 3138 (52.7) | 4435 (20.6) | 7573 (27.6) | |
| primary_sub_mod (%) | marijuana | 189 (3.2) | 1383 (6.4) | 1572 (5.7) | |
| primary_sub_mod (%) | others | 99 (1.7) | 375 (1.7) | 474 (1.7) | |
| Primary substance at admission to treatment, alcohol (%) | 0 | 2818 (47.3) | 17054 (79.4) | 19872 (72.4) | 0.705 |
| Primary substance at admission to treatment, alcohol (%) | 1 | 3138 (52.7) | 4435 (20.6) | 7573 (27.6) | |
| Primary substance at admission to treatment, cocaine hydrochloride (%) | 0 | 5146 (86.4) | 16700 (77.7) | 21846 (79.6) | 0.228 |
| Primary substance at admission to treatment, cocaine hydrochloride (%) | 1 | 810 (13.6) | 4789 (22.3) | 5599 (20.4) | |
| Primary substance at admission to treatment, cocaine base paste (%) | 0 | 4236 (71.1) | 10982 (51.1) | 15218 (55.4) | 0.420 |
| Primary substance at admission to treatment, cocaine base paste (%) | 1 | 1720 (28.9) | 10507 (48.9) | 12227 (44.6) | |
| Primary substance at admission to treatment, marijuana (%) | 0 | 5767 (96.8) | 20106 (93.6) | 25873 (94.3) | 0.153 |
| Primary substance at admission to treatment, marijuana (%) | 1 | 189 (3.2) | 1383 (6.4) | 1572 (5.7) |
Code
# 0) received an average of 2.3 SUD treatments
summary(dat_with_weights_stab_cons$max_treatment)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 2.000 2.000 2.000 2.796 3.000 13.000
# 0) Prevalence
# One row per patient (first record by hash_key), then tabulate PSU status.
tidytable::slice_head(dat_with_weights_stab_cons,n = 1, .by = hash_key) |> janitor::tabyl(polysubstance_strict)
# polysubstance_strict n percent
# 0 6543 0.2384041
# 1 20902 0.7615959
# 1) Cross-tab: non-completion × setting
tab1 <- table(tidytable::slice_head(dat_with_weights_stab_cons,n = 1, .by = hash_key)$plan_type_mod,
tidytable::slice_head(dat_with_weights_stab_cons,n = 1, .by = hash_key)$tr_noncompletion)
# 2) Cross-tab: PSU × setting (assuming PSU is in sex_rec or another column)
# e.g., suppose sex_rec actually encodes PSU vs single-use:
# NOTE(review): the rendered output below shows hombre/mujer, i.e. sex_rec
# holds sex, not PSU — confirm the intended variable for this cross-tab.
tab2 <- table(tidytable::slice_head(dat_with_weights_stab_cons,n = 1, .by = hash_key)$plan_type_mod,
tidytable::slice_head(dat_with_weights_stab_cons,n = 1, .by = hash_key)$sex_rec)
format_chisq <- function(test){
  # Format an `htest` chi-squared result as "χ²(df) = stat, P ..." for use in
  # table captions.
  #
  # test: an object of class "htest" (e.g. from chisq.test()) with scalar
  #       `statistic`, `parameter` (df) and `p.value` components.
  # Returns a glue string, e.g. "χ²(4) = 14.90, P =0.005".
  stat <- round(unname(test$statistic), 2)
  df <- unname(test$parameter)
  p <- test$p.value
  # P-value formatting — the condition is scalar, so use if/else rather than
  # the vectorized ifelse() (identical output, clearer intent, keeps class).
  p_str <- if (p < .001) "<.001" else sprintf("=%.3f", p)
  glue::glue("χ²({df}) = {stat}, P {p_str}")
}
format_residuals <- function(resid_mat, cutoff = 2) {
  # Turn a matrix of standardized (Pearson) residuals into a tidy table of
  # the cells with |residual| >= cutoff, formatted for reporting.
  #
  # resid_mat: matrix of residuals (e.g. chisq.test(...)$residuals).
  # cutoff:    absolute-residual threshold for inclusion (default 2 ~ 95%).
  # Returns a data.frame with Var1, Var2, Residual (2 dp) and signif (p, 2 dp).
  #
  # Two-sided p-values from residuals ~ N(0,1)
  pvals <- 2 * (1 - pnorm(abs(resid_mat)))
  as.data.frame(as.table(resid_mat)) |>
    dplyr::mutate(
      # as.data.frame(as.table()) and as.vector() both unroll column-major,
      # so pval stays aligned with Freq.
      pval = as.vector(pvals),
      signif = sprintf("%.2f", round(pval, 3)),
      Residual = sprintf("%.2f", Freq)
    ) |>
    # BUG FIX: filter on the raw residual (Freq), not the re-parsed 2-decimal
    # string — rounding let cells with |residual| just below the cutoff
    # (e.g. 1.996 -> "2.00") pass the threshold.
    dplyr::filter(abs(Freq) >= cutoff) |>
    dplyr::select(Var1, Var2, Residual, signif)
}
# Saved transcripts from an earlier run (values differ from the rendered
# output below this chunk):
# χ²(4) = 400.46, P <.001
# χ²(4) = 10680.52, P <.001
#
# Var1 Var2 Residual signif
# 1 GP basic ambulatory 0 -3.55 0.00
# 2 GP intensive ambulatory 0 -4.96 0.00
# 3 GP residential 0 16.17 0.00
# 4 WO intensive ambulatory 0 -3.64 0.00
# 5 WO residential 0 4.17 0.00
# 6 GP intensive ambulatory 1 2.30 0.02
# 7 GP residential 1 -7.51 0.00
# Var1 Var2 Residual signif
# 1 GP basic ambulatory hombre 9.33 0.00
# 2 GP intensive ambulatory hombre 11.85 0.00
# 3 GP residential hombre 17.95 0.00
# 4 WO intensive ambulatory hombre -40.54 0.00
# 5 WO residential hombre -33.60 0.00
# 6 GP basic ambulatory mujer -13.89 0.00
# 7 GP intensive ambulatory mujer -17.63 0.00
# 8 GP residential mujer -26.71 0.00
# 9 WO intensive ambulatory mujer 60.34 0.00
# 10 WO residential mujer 50.01 0.00
# Render tables of standardized residuals with |residual| >= 2 plus captions.
format_residuals(chisq.test(tab1)$residuals) |> knitr::kable("markdown", caption= paste0("non-completion × setting (", format_chisq(chisq.test(tab1)),")"))
# NOTE(review): tab2 crosses setting with sex_rec (hombre/mujer per output
# above), so the caption "PSU x setting" appears mislabeled — confirm whether
# it should read "sex × setting".
format_residuals(chisq.test(tab2)$residuals) |> knitr::kable("markdown", caption= paste0("PSU x setting (", format_chisq(chisq.test(tab2)),")"))
woolf_manual <- function(tab3d) {
  # Woolf's test for homogeneity of odds ratios across the strata of a
  # 2 x 2 x k contingency array.
  #
  # tab3d: 2 x 2 x k array of counts (e.g. from xtabs with three factors).
  # Returns a list with the chi-squared statistic, its df (k - 1), the
  # p-value, and the per-stratum log-OR, OR and log-OR variance vectors.
  n_strata <- dim(tab3d)[3]
  # Per-stratum 2x2 cells as plain length-k vectors (names dropped).
  cell_a <- as.vector(tab3d[1, 1, ])
  cell_b <- as.vector(tab3d[1, 2, ])
  cell_c <- as.vector(tab3d[2, 1, ])
  cell_d <- as.vector(tab3d[2, 2, ])
  # Stratum-specific log odds ratios and their (Woolf) variances.
  logOR <- log((cell_a * cell_d) / (cell_b * cell_c))
  var_logOR <- 1 / cell_a + 1 / cell_b + 1 / cell_c + 1 / cell_d
  # Inverse-variance weights; pooled log-OR under the homogeneity hypothesis.
  w <- 1 / var_logOR
  theta_bar <- sum(w * logOR) / sum(w)
  # Weighted squared deviations from the pooled log-OR ~ chi-squared(k - 1).
  X2 <- sum(w * (logOR - theta_bar)^2)
  df <- n_strata - 1
  p <- 1 - pchisq(X2, df)
  list(statistic = X2, df = df, p.value = p,
       logOR = logOR, OR = exp(logOR), var = var_logOR)
}
# Example with your data:
# 3-way table (PSU × non-completion × plan type) on one row per patient.
tab <- xtabs(~ polysubstance_strict + tr_noncompletion + plan_type_mod,
data = tidytable::slice_head(dat_with_weights_stab_cons,n = 1, .by = hash_key))
woolf_manual(tab)
# Saved transcripts from two earlier runs (differ from the rendered output):
# $statistic
# [1] 55.8
#
# $df
# [1] 4
#
# $p.value
# [1] 2.18e-11
# $statistic
# [1] 24.37842
#
# $df
# [1] 4
#
# $p.value
# [1] 0.0000670675
# Per-stratum (plan type) χ² test of PSU × non-completion, formatted as text.
# Note: runs on ALL episode rows, not one row per patient.
assoc_by_stratum <- dat_with_weights_stab_cons %>%
split(.$plan_type_mod) %>%
map(~ {
tab <- table(.x$polysubstance_strict, .x$tr_noncompletion)
test <- suppressWarnings(chisq.test(tab)) # χ² test
stat <- unname(test$statistic)
df <- unname(test$parameter)
pval <- test$p.value
# Format nicely
p_str <- ifelse(pval < .001, "<.001", sprintf("=%.2f", pval))
tibble::tibble(
plan_type_mod = unique(.x$plan_type_mod),
chisq_string = sprintf("χ²(%d) = %.2f, P %s", df, stat, p_str)
)
}) %>%
bind_rows()
# Saved transcripts from earlier runs:
# plan_type_mod chisq_string
# <chr> <chr>
# 1 GP basic ambulatory χ²(1) = 335.27, P <.001
# 2 GP intensive ambulatory χ²(1) = 241.82, P <.001
# 3 GP residential χ²(1) = 22.78, P <.001
# 4 WO intensive ambulatory χ²(1) = 32.19, P <.001
# 5 WO residential χ²(1) = 19.40, P <.001
# 1 GP basic ambulatory χ²(1) = 314.19, P <.001
# 2 GP intensive ambulatory χ²(1) = 216.23, P <.001
# 3 GP residential χ²(1) = 16.22, P <.001
# 4 WO intensive ambulatory χ²(1) = 27.09, P <.001
# 5 WO residential χ²(1) = 14.88, P <.001
knitr::kable(assoc_by_stratum, "markdown", caption= "Association between noncompletion and PSU") Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.000 2.000 2.796 3.000 13.000
polysubstance_strict n percent
0 5956 0.2170158
1 21489 0.7829842
| Var1 | Var2 | Residual | signif |
|---|---|---|---|
| GP basic ambulatory | 0 | -6.00 | 0.00 |
| GP intensive ambulatory | 0 | -3.44 | 0.00 |
| GP residential | 0 | 12.70 | 0.00 |
| WO residential | 0 | 3.72 | 0.00 |
| GP basic ambulatory | 1 | 3.24 | 0.00 |
| GP residential | 1 | -6.87 | 0.00 |
| WO residential | 1 | -2.01 | 0.04 |
| Var1 | Var2 | Residual | signif |
|---|---|---|---|
| GP basic ambulatory | hombre | 9.05 | 0.00 |
| GP intensive ambulatory | hombre | 10.91 | 0.00 |
| GP residential | hombre | 19.12 | 0.00 |
| WO intensive ambulatory | hombre | -35.58 | 0.00 |
| WO residential | hombre | -37.45 | 0.00 |
| GP basic ambulatory | mujer | -13.68 | 0.00 |
| GP intensive ambulatory | mujer | -16.49 | 0.00 |
| GP residential | mujer | -28.89 | 0.00 |
| WO intensive ambulatory | mujer | 53.76 | 0.00 |
| WO residential | mujer | 56.60 | 0.00 |
$statistic
[1] 14.89707
$df
[1] 4
$p.value
[1] 0.004919527
$logOR
[1] 0.5419830 0.3926138 0.1484303 0.2906816 0.2998210
$OR
[1] 1.719413 1.480846 1.160012 1.337339 1.349617
$var
[1] 0.003268931 0.003143068 0.008553028 0.017393284 0.016000554
| plan_type_mod | chisq_string |
|---|---|
| GP basic ambulatory | χ²(1) = 242.25, P <.001 |
| GP intensive ambulatory | χ²(1) = 163.65, P <.001 |
| GP residential | χ²(1) = 13.58, P <.001 |
| WO intensive ambulatory | χ²(1) = 22.12, P <.001 |
| WO residential | χ²(1) = 58.00, P <.001 |
The association between PSU and treatment non-completion varied significantly across treatment settings [Woolf test: χ²(4)=14.90, P=.005]. At the stratum level, PSU was consistently associated with increased odds of non-completion, but the strength differed: strongest in general-population basic ambulatory (OR≈1.72, χ²(1)=242.25, P<.001) and intensive ambulatory (OR≈1.48, χ²(1)=163.65, P<.001), and more modest in residential settings (GP: OR≈1.16, χ²(1)=13.58, P<.001; WO: OR≈1.35, χ²(1)=58.00, P<.001) and women-only intensive ambulatory care (OR≈1.34, χ²(1)=22.12, P<.001).
Code
# PSU prevalence ("ever PSU" per patient) by number of treatment episodes.
# NOTE(review): the summary runs over episode rows, so psu_rate is
# episode-weighted within each max_treatment stratum (n counts episodes, not
# patients) — confirm this is the intended denominator.
dat_with_weights_stab_cons %>%
tidytable::group_by(id) %>%
tidytable::mutate(max_treatment = tidytable::n()) %>%
tidytable::mutate(
psu_any = as.integer(any(polysubstance_strict == 1, na.rm = TRUE))
) %>%
tidytable::group_by(max_treatment) %>%
tidytable::summarise(
n = tidytable::n(),
psu_rate = mean(psu_any),
.by = max_treatment
) %>%
knitr::kable("markdown", caption= "Proportion of patients with PSU, by no. of treatments")
# Patient-level PSU summary among patients with 2+ episodes: PSU at the first
# episode (admission), at any episode, and emerging only after the first.
dat_with_weights_stab_cons %>%
tidytable::arrange(id, treatment) %>%
tidytable::group_by(id) %>%
tidytable::filter(tidytable::n() >= 2) %>%
tidytable::summarise(
psu_adm = first(polysubstance_strict),
psu_any = as.integer(any(polysubstance_strict == 1, na.rm = TRUE)),
psu_later = as.integer(
any(polysubstance_strict == 1 & row_number() > 1, na.rm = TRUE)
)
) %>%
tidytable::summarise(
total = tidytable::n(),
adm_1 = sum(psu_adm == 1),
any_1 = sum(psu_any == 1),
later_only = sum(psu_adm == 0 & psu_any == 1),
.by = NULL
) |>
knitr::kable(
format = "markdown",
caption = "Summary of PSU across repeated treatments",
col.names= c("Total patients",
"PSU at admission",
"PSU at any treatment episode",
"PSU, later-only")
)
# | Total patients| PSU at admission| PSU at any treatment episode| PSU, later-only|
# |--------------:|----------------:|----------------------------:|---------------:|
# | 27445| 21489| 24231| 2742|| max_treatment | n | psu_rate |
|---|---|---|
| 2 | 38040 | 0.8590957 |
| 3 | 16554 | 0.9209859 |
| 4 | 7344 | 0.9542484 |
| 5 | 3275 | 0.9816794 |
| 6 | 1596 | 0.9924812 |
| 7 | 602 | 1.0000000 |
| 8 | 320 | 1.0000000 |
| 9 | 153 | 1.0000000 |
| 10 | 60 | 1.0000000 |
| 13 | 13 | 1.0000000 |
| Total patients | PSU at admission | PSU at any treatment episode | PSU, later-only |
|---|---|---|---|
| 27445 | 21489 | 24231 | 2742 |
People with 2+ treatments have higher PSU rates (85.9%–100%) than those with only 1 treatment (64.2%).
Table 2
Code
# 0) Optional time window restriction if needed (keeps records within 2010-2019)
# Comment out if dat is already restricted.
# 1) Person-level cohort: multiple SUD treatments.
# Collapses the episode-level data to one row per patient, with PSU flags,
# non-completion indicators and person-level follow-up time.
px <- dat_with_weights_stab_cons %>%
tidytable::ungroup() %>%
tidytable::arrange(id, treatment) %>%
tidytable::group_by(id) %>%
tidytable::filter(max_treatment >= 2) %>% # multiple treatments
tidytable::summarise(
# PSU flags
psu_adm = first(polysubstance_strict), # PSU at first treatment
psu_any = as.integer(any(polysubstance_strict == 1, na.rm = TRUE)), # PSU in any treatment
# Non-completion events
nc_first = first(tr_noncompletion), # non-completion at first treatment (0/1)
nc_any = as.integer(any(tr_noncompletion == 1, na.rm = TRUE)), # ≥1 non-completion
# Follow-up time in months (person-months)
pt = suppressWarnings(first(cens_time)) # person-level follow-up time
) %>%
tidytable::ungroup() %>%
tidytable::mutate(
# Human-readable PSU labels for the table panels built below.
psu_adm_lbl = tidytable::if_else(psu_adm == 1, "Reported", "Not reported"),
psu_any_lbl = tidytable::if_else(psu_any == 1, "Reported", "Not reported")
)
# Helper to compute IR and normal-approx 95% CI per 1,000 person-months.
# df: person-level table with a `pt` follow-up-time column.
# group_var, event_col: unquoted columns (tidy-eval, embraced with {{ }}) —
#   the grouping label and the 0/1 event indicator, respectively.
# Returns one formatted row per group level.
# NOTE(review): se = 1000*sqrt(events)/T is a Poisson normal approximation,
# so lcl can dip below 0 for sparse groups — confirm acceptable for reporting.
summarise_block <- function(df, group_var, event_col) {
df %>%
tidytable::summarise(
n = tidytable::n(),
prevalence = mean({{ event_col }} == 1, na.rm = TRUE),
follow_up_time = sum(pt, na.rm = TRUE),
events = sum({{ event_col }}, na.rm = TRUE),
.by = {{ group_var }}
) %>%
tidytable::mutate(
IR = 1000 * events / follow_up_time,
se = 1000 * sqrt(pmax(events, 1e-9)) / follow_up_time,
lcl = IR - 1.96 * se,
ucl = IR + 1.96 * se,
`IR (95% CI)` = sprintf("%.2f (%.2f, %.2f)", IR, lcl, ucl),
Prevalence = scales::percent(prevalence, accuracy = 1),
`Follow-up time`= scales::number(follow_up_time, accuracy = 0.01, big.mark = ","),
`Polysubstance use` = paste0({{ group_var }}, " (n= ", scales::comma(n), ")")
) %>%
tidytable::select(
`Polysubstance use`, Prevalence, `Follow-up time`, Events = events, `IR (95% CI)`
)
}
# 2) Four table panels
# A) PSU at admission AND at least one event of non-completion
tab_A <- summarise_block(px, psu_adm_lbl, nc_any) %>%
  mutate(Type = "PSU at admission and at least one event of non-completion")
# B) PSU at admission AND non-completion at first treatment
tab_B <- summarise_block(px, psu_adm_lbl, nc_first) %>%
  mutate(Type = "PSU at admission and non-completion at first treatment")
# C) ≥1 treatment reporting PSU AND ≥1 event of non-completion
tab_C <- summarise_block(px, psu_any_lbl, nc_any) %>%
  mutate(Type = "At least one treatment reporting PSU and at least one event of non-completion")
# D) ≥1 treatment reporting PSU AND non-completion at first treatment
tab_D <- summarise_block(px, psu_any_lbl, nc_first) %>%
  mutate(Type = "At least one treatment reporting PSU and non-completion at the first treatment")
# 3) Bind and order like Table 2 (panel order fixed via the Type factor,
# "Not reported" before "Reported" within each panel).
table_2 <- bind_rows(tab_A, tab_B, tab_C, tab_D) %>%
  tidytable::mutate(
    Type = factor(
      Type,
      c(
        "PSU at admission and at least one event of non-completion",
        "PSU at admission and non-completion at first treatment",
        "At least one treatment reporting PSU and at least one event of non-completion",
        "At least one treatment reporting PSU and non-completion at the first treatment"
      )
    ),
    grp_ord = factor(gsub(" \\(n=.*", "", `Polysubstance use`),
                     levels = c("Not reported", "Reported"))
  ) %>%
  tidytable::arrange(Type, grp_ord) %>%
  tidytable::select(-grp_ord)
# 4) Print
# BUG FIX: table_2's columns are (Polysubstance use, Prevalence,
# Follow-up time, Events, IR (95% CI), Type). The previous col.names vector
# started with "type", shifting every header one column to the left (visible
# in the rendered table). Labels now match the actual column order.
knitr::kable(
  table_2,
  align = "lrrrr",
  col.names = c("Polysubstance use", "Prevalence", "Follow-up time", "Events", "IR (95% CI)", "Type")
)
# type Polysubstance use Prevalence Follow-up time Events IR (95% CI)
# Not reported (n= 6,909) 91% 734,496.81 6314 8.60 (8.38, 8.81) PSU at admission and at least one event of non-completion
# Reported (n= 24,360) 95% 2,948,798.17 23163 7.86 (7.75, 7.96) PSU at admission and at least one event of non-completion
# Not reported (n= 6,909) 79% 734,496.81 5429 7.39 (7.19, 7.59) PSU at admission and non-completion at first treatment
# Reported (n= 24,360) 83% 2,948,798.17 20293 6.88 (6.79, 6.98) PSU at admission and non-completion at first treatment
# Not reported (n= 3,600) 87% 364,607.66 3145 8.63 (8.32, 8.93) At least one treatment reporting PSU and at least one event of non-completion
# Reported (n= 27,669) 95% 3,318,687.31 26332 7.93 (7.84, 8.03) At least one treatment reporting PSU and at least one event of non-completion
# Not reported (n= 3,600) 74% 364,607.66 2650 7.27 (6.99, 7.54) At least one treatment reporting PSU and non-completion at the first treatment
# Reported (n= 27,669) 83% 3,318,687.31 23072 6.95 (6.86, 7.04) At least one treatment reporting PSU and non-completion at the first treatment| type | Polysubstance use | Prevalence | Follow-up time | Events | IR (95% CI) |
|---|---|---|---|---|---|
| Not reported (n= 5,956) | 88% | 654,592.44 | 5268 | 8.05 (7.83, 8.27) | PSU at admission and at least one event of non-completion |
| Reported (n= 21,489) | 93% | 2,686,018.20 | 20001 | 7.45 (7.34, 7.55) | PSU at admission and at least one event of non-completion |
| Not reported (n= 5,956) | 73% | 654,592.44 | 4323 | 6.60 (6.41, 6.80) | PSU at admission and non-completion at first treatment |
| Reported (n= 21,489) | 79% | 2,686,018.20 | 16909 | 6.30 (6.20, 6.39) | PSU at admission and non-completion at first treatment |
| Not reported (n= 3,214) | 84% | 336,381.49 | 2702 | 8.03 (7.73, 8.34) | At least one treatment reporting PSU and at least one event of non-completion |
| Reported (n= 24,231) | 93% | 3,004,229.15 | 22567 | 7.51 (7.41, 7.61) | At least one treatment reporting PSU and at least one event of non-completion |
| Not reported (n= 3,214) | 67% | 336,381.49 | 2165 | 6.44 (6.17, 6.71) | At least one treatment reporting PSU and non-completion at the first treatment |
| Reported (n= 24,231) | 79% | 3,004,229.15 | 19067 | 6.35 (6.26, 6.44) | At least one treatment reporting PSU and non-completion at the first treatment |
Stratify by plan type
Code
cat("percentage of women by plan type (women, vs. gp)\n")
subset(dat_with_weights_stab_cons, treatment==1, c("sex_rec","plan_type_mod")) |> mutate(wo=grepl("WO",plan_type_mod)) |> janitor::tabyl(sex_rec, wo) |> janitor::adorn_percentages("col")
# sex_rec FALSE TRUE
# hombre 0.8084971 0
# mujer 0.1915029 1
# Stratified (by plan type) modified-Poisson GEE of non-completion on PSU,
# UNWEIGHTED. Per stratum: freeze the complete-case row universe, drop
# constant and collinear design columns, rebuild the formula from the
# surviving terms, then fit geeglm and report exponentiated estimates (RRs)
# with robust (sandwich) CIs.
tab3<-
split(dat_with_weights_stab_cons, dat_with_weights_stab_cons$plan_type_mod)|>
purrr::imap(~{
df <- .x|> dplyr::mutate(id = as.character(id))
# Clean formula: NO cluster()/strata()/id inside
ff_local <- tr_noncompletion ~ polysubstance_strict +
age_c10 + year_c +
susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
prim_sub_daily + occ24_inactive + occ24_unemployed +
susprindum_coc + susprindum_pbc + susprindum_mar
# Freeze row universe once
mf0 <- stats::model.frame(ff_local, data = df, na.action = stats::na.pass)
keep <- stats::complete.cases(mf0)
mf <- mf0[keep, , drop = FALSE]
if (nrow(mf) == 0L) return(NULL)
# Outcome variation check
y <- stats::model.response(mf)
if (length(unique(y)) < 2L) {
return(tibble::tibble(
plan_type_mod = as.character(.y),
n_rows = nrow(mf),
term = "(skipped: no outcome variation)",
estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
))
}
# Design matrix and term mapping
X <- stats::model.matrix(ff_local, mf)
assign_vec <- attr(X, "assign") # maps columns -> term index
term_labels <- attr(stats::terms(ff_local), "term.labels")
# 1) Drop constant columns (except intercept)
const <- vapply(as.data.frame(X), function(v) length(unique(v)) == 1L, logical(1))
const[colnames(X) == "(Intercept)"] <- FALSE
if (any(const)) {
X <- X[, !const, drop = FALSE]
assign_vec <- assign_vec[!const]
}
# 2) Drop collinear columns via QR
qrX <- qr(X)
keep_cols <- sort(qrX$pivot[seq_len(qrX$rank)])
X_reduced <- X[, keep_cols, drop = FALSE]
assign_kept <- assign_vec[keep_cols]
# 3) Map kept columns back to ORIGINAL term labels
kept_term_idx <- sort(unique(assign_kept[assign_kept > 0])) # >0 skips intercept
if (length(kept_term_idx) == 0L) {
return(tibble::tibble(
plan_type_mod = as.character(.y),
n_rows = nrow(mf),
term = "(no predictors after cleanup)",
estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
))
}
kept_terms <- term_labels[kept_term_idx]
# 4) Rebuild reduced formula from ORIGINAL terms
ff_reduced <- stats::reformulate(kept_terms, response = "tr_noncompletion")
# 5) Align id to kept rows (same `keep` mask)
id_vec <- df$id[keep]
# 6) Fit GEE (intercept-only scale)
fit <- geepack::geeglm(
ff_reduced,
id = id_vec,
data = mf,
family = poisson,
corstr = "independence",
scale.fix = TRUE,
na.action = stats::na.exclude
#, weights = df$w0_tr[keep] # or w1_tr, if you want weights by stratum
)
broom::tidy(fit, exponentiate = TRUE, conf.int = TRUE)|>
dplyr::transmute(
plan_type_mod = as.character(.y),
n_rows = nrow(mf),
term,
`RR (95% CI)` = sprintf("%.4f (%.4f, %.4f)", estimate, conf.low, conf.high)
)
})|>
dplyr::bind_rows()|>
dplyr::relocate(plan_type_mod, n_rows)
# IPW-weighted stratified GEE (stabilized weights, lag 0): same specification
# as the unweighted `tab3`, but each episode contributes with its weight
# `w0_tr`. Modified-Poisson GEE with robust SEs yields risk ratios.
tab3_w0<-
  # BUG FIX: the split index previously read `dat$plan_type_mod` — a DIFFERENT
  # object from the data being split, so strata and rows were not guaranteed
  # to align (or errored if `dat` was absent). Split and index the same data,
  # consistently with `tab3`.
  split(dat_with_weights_stab_cons, dat_with_weights_stab_cons$plan_type_mod)|>
  purrr::imap(~{
    df <- .x|> dplyr::mutate(id = as.character(id))
    # Clean formula: NO cluster()/strata()/id inside
    ff_local <- tr_noncompletion ~ polysubstance_strict +
      age_c10 + year_c +
      susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
      dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
      prim_sub_daily + occ24_inactive + occ24_unemployed +
      susprindum_coc + susprindum_pbc + susprindum_mar
    # Freeze row universe once (complete cases on all model variables)
    mf0 <- stats::model.frame(ff_local, data = df, na.action = stats::na.pass)
    keep <- stats::complete.cases(mf0)
    mf <- mf0[keep, , drop = FALSE]
    if (nrow(mf) == 0L) return(NULL)
    # Outcome variation check: a constant outcome cannot be modeled
    y <- stats::model.response(mf)
    if (length(unique(y)) < 2L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(skipped: no outcome variation)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }
    # Design matrix and term mapping
    X <- stats::model.matrix(ff_local, mf)
    assign_vec <- attr(X, "assign") # maps columns -> term index
    term_labels <- attr(stats::terms(ff_local), "term.labels")
    # 1) Drop constant columns (except intercept)
    const <- vapply(as.data.frame(X), function(v) length(unique(v)) == 1L, logical(1))
    const[colnames(X) == "(Intercept)"] <- FALSE
    if (any(const)) {
      X <- X[, !const, drop = FALSE]
      assign_vec <- assign_vec[!const]
    }
    # 2) Drop collinear columns via QR
    qrX <- qr(X)
    keep_cols <- sort(qrX$pivot[seq_len(qrX$rank)])
    X_reduced <- X[, keep_cols, drop = FALSE]
    assign_kept <- assign_vec[keep_cols]
    # 3) Map kept columns back to ORIGINAL term labels
    kept_term_idx <- sort(unique(assign_kept[assign_kept > 0])) # >0 skips intercept
    if (length(kept_term_idx) == 0L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(no predictors after cleanup)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }
    kept_terms <- term_labels[kept_term_idx]
    # 4) Rebuild reduced formula from ORIGINAL terms
    ff_reduced <- stats::reformulate(kept_terms, response = "tr_noncompletion")
    # 5) Align id to kept rows (same `keep` mask)
    id_vec <- df$id[keep]
    # 6) Fit GEE (intercept-only scale), weighted by the stabilized IPW
    fit <- geepack::geeglm(
      ff_reduced,
      id = id_vec,
      data = mf,
      family = poisson,
      corstr = "independence",
      scale.fix = TRUE,
      na.action = stats::na.exclude
      , weights = df$w0_tr[keep] # or w1_tr, if you want weights by stratum
    )
    broom::tidy(fit, exponentiate = TRUE, conf.int = TRUE)|>
      dplyr::transmute(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term,
        `RR (95% CI)` = sprintf("%.4f (%.4f, %.4f)", estimate, conf.low, conf.high)
      )
  })|>
  dplyr::bind_rows()|>
  dplyr::relocate(plan_type_mod, n_rows)
# IPW-weighted stratified GEE (stabilized weights, lag 1): identical to
# `tab3_w0` except episodes are weighted by `w1_tr`.
tab3_w1<-
  # BUG FIX: the split index previously read `dat$plan_type_mod` — a DIFFERENT
  # object from the data being split, so strata and rows were not guaranteed
  # to align (or errored if `dat` was absent). Split and index the same data,
  # consistently with `tab3`.
  split(dat_with_weights_stab_cons, dat_with_weights_stab_cons$plan_type_mod)|>
  purrr::imap(~{
    df <- .x|> dplyr::mutate(id = as.character(id))
    # Clean formula: NO cluster()/strata()/id inside
    ff_local <- tr_noncompletion ~ polysubstance_strict +
      age_c10 + year_c +
      susinidum_oh + susinidum_coc + susinidum_pbc + susinidum_mar +
      dg_psiq_cie_10_instudy + dg_psiq_cie_10_dg +
      prim_sub_daily + occ24_inactive + occ24_unemployed +
      susprindum_coc + susprindum_pbc + susprindum_mar
    # Freeze row universe once (complete cases on all model variables)
    mf0 <- stats::model.frame(ff_local, data = df, na.action = stats::na.pass)
    keep <- stats::complete.cases(mf0)
    mf <- mf0[keep, , drop = FALSE]
    if (nrow(mf) == 0L) return(NULL)
    # Outcome variation check: a constant outcome cannot be modeled
    y <- stats::model.response(mf)
    if (length(unique(y)) < 2L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(skipped: no outcome variation)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }
    # Design matrix and term mapping
    X <- stats::model.matrix(ff_local, mf)
    assign_vec <- attr(X, "assign") # maps columns -> term index
    term_labels <- attr(stats::terms(ff_local), "term.labels")
    # 1) Drop constant columns (except intercept)
    const <- vapply(as.data.frame(X), function(v) length(unique(v)) == 1L, logical(1))
    const[colnames(X) == "(Intercept)"] <- FALSE
    if (any(const)) {
      X <- X[, !const, drop = FALSE]
      assign_vec <- assign_vec[!const]
    }
    # 2) Drop collinear columns via QR
    qrX <- qr(X)
    keep_cols <- sort(qrX$pivot[seq_len(qrX$rank)])
    X_reduced <- X[, keep_cols, drop = FALSE]
    assign_kept <- assign_vec[keep_cols]
    # 3) Map kept columns back to ORIGINAL term labels
    kept_term_idx <- sort(unique(assign_kept[assign_kept > 0])) # >0 skips intercept
    if (length(kept_term_idx) == 0L) {
      return(tibble::tibble(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term = "(no predictors after cleanup)",
        estimate = NA_real_, conf.low = NA_real_, conf.high = NA_real_
      ))
    }
    kept_terms <- term_labels[kept_term_idx]
    # 4) Rebuild reduced formula from ORIGINAL terms
    ff_reduced <- stats::reformulate(kept_terms, response = "tr_noncompletion")
    # 5) Align id to kept rows (same `keep` mask)
    id_vec <- df$id[keep]
    # 6) Fit GEE (intercept-only scale), weighted by the stabilized IPW
    fit <- geepack::geeglm(
      ff_reduced,
      id = id_vec,
      data = mf,
      family = poisson,
      corstr = "independence",
      scale.fix = TRUE,
      na.action = stats::na.exclude
      , weights = df$w1_tr[keep] # or w1_tr, if you want weights by stratum
    )
    broom::tidy(fit, exponentiate = TRUE, conf.int = TRUE)|>
      dplyr::transmute(
        plan_type_mod = as.character(.y),
        n_rows = nrow(mf),
        term,
        `RR (95% CI)` = sprintf("%.4f (%.4f, %.4f)", estimate, conf.low, conf.high)
      )
  })|>
  dplyr::bind_rows()|>
  dplyr::relocate(plan_type_mod, n_rows)
format_ci <- function(x) {
  # Reformat "est (lo, hi)" strings to two decimal places, e.g.
  # "1.3045 (1.2010, 1.4078)" -> "1.30 (1.20, 1.41)".
  #
  # x: character vector of CI strings.
  # Returns a character vector of the same length; inputs that do not match
  # the pattern yield "NA (NA, NA)" (matching the previous behavior).
  #
  # Uses base regexec()/regmatches() instead of stringr::str_match — same
  # capture groups, one less package dependency.
  matches <- regmatches(x, regexec("([0-9.]+) \\(([^,]+), ([^)]+)\\)", x))
  vapply(matches, function(parts) {
    # parts = c(full match, est, lo, hi); empty when the regex did not match.
    nums <- if (length(parts) >= 4L) as.numeric(parts[2:4]) else rep(NA_real_, 3L)
    sprintf("%.2f (%.2f, %.2f)", nums[1], nums[2], nums[3])
  }, character(1))
}
# Combine the PSU rows of the unweighted and IPW-weighted stratified RRs into
# one wide table, write it to CSV, and render the rounded RRs as markdown.
rbind.data.frame(
cbind.data.frame(w="no weight", filter(tab3, grepl("poly", term))),
cbind.data.frame(w="weight lag=0", filter(tab3_w0, grepl("poly", term))),
cbind.data.frame(w="weight lag=1", filter(tab3_w1, grepl("poly", term)))
)|>
pivot_wider(
names_from = w,
values_from = `RR (95% CI)`
)|>
relocate(plan_type_mod, n_rows, term)|>
select(-term)|>
(\(df) {
cat(paste0("Total cases included in the model: ",
formatC(summarise(df, sum_n= sum(n_rows)) |> pull(sum_n), big.mark = ","), "\n"))
utils::write.csv(df, file.path(wdpath, "data/20241015_out/psu", "stratified_results.csv"), row.names = FALSE)
# NOTE: `->>` superassigns into the enclosing/global environment — this
# deliberately exports the table for the meta-analysis chunk that follows.
df ->> stratified_results
df
})()|>
mutate(across(
c(`no weight`, `weight lag=0`, `weight lag=1`),
format_ci
))|>
knitr::kable("markdown", digits = 2)
# Saved transcript from an earlier run:
# |plan_type_mod | n_rows|no weight |weight lag=0 |weight lag=1 |
# |:-----------------------|------:|:-----------------|:-----------------|:-----------------|
# |GP basic ambulatory | 20427|1.30 (1.20, 1.40) |1.30 (1.19, 1.41) |1.27 (1.16, 1.38) |
# |GP intensive ambulatory | 33782|1.21 (1.13, 1.29) |1.21 (1.13, 1.31) |1.17 (1.09, 1.26) |
# |GP residential | 13991|0.96 (0.86, 1.08) |0.98 (0.86, 1.10) |0.95 (0.84, 1.07) |
# |WO intensive ambulatory | 6476|1.24 (1.07, 1.43) |1.21 (1.04, 1.42) |1.26 (1.08, 1.47) |
# |WO residential | 6733|1.14 (0.98, 1.32) |1.10 (0.93, 1.29) |1.14 (0.97, 1.35) |
#67,957percentage of women by plan type (women, vs. gp)
sex_rec FALSE TRUE
hombre 0.8084971 0
mujer 0.1915029 1
Total cases included in the model: 67,957
| plan_type_mod | n_rows | no weight | weight lag=0 | weight lag=1 |
|---|---|---|---|---|
| GP basic ambulatory | 20753 | 1.05 (1.03, 1.07) | 1.06 (1.04, 1.08) | 1.06 (1.03, 1.08) |
| GP intensive ambulatory | 26825 | 1.04 (1.02, 1.06) | 1.04 (1.02, 1.06) | 1.04 (1.02, 1.06) |
| GP residential | 10410 | 1.02 (0.99, 1.06) | 1.03 (0.99, 1.07) | 1.01 (0.97, 1.05) |
| WO intensive ambulatory | 4557 | 1.04 (1.00, 1.09) | 1.05 (1.00, 1.10) | 1.05 (1.00, 1.09) |
| WO residential | 5412 | 1.12 (1.07, 1.17) | 1.13 (1.07, 1.18) | 1.13 (1.07, 1.18) |
Code
library(metafor)Code
# Helper to extract numbers
parse_ci <- function(x) {
  # Pull the point estimate and 95% CI bounds out of strings shaped like
  # "1.05 (1.03, 1.07)"; unmatched inputs yield NA in every column.
  caps <- stringr::str_match(x, "([0-9.]+) \\(([^,]+), ([^)]+)\\)")
  # Capture groups 2:4 hold est / lo / hi, in that order.
  cols <- lapply(2:4, function(col_idx) as.numeric(caps[, col_idx]))
  names(cols) <- c("est", "ci_lo", "ci_hi")
  tibble::as_tibble(cols)
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
##_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Reshape the stratified results to long format — one row per
# (plan type, weighting model) — then parse the "est (lo, hi)" strings
# into numeric columns and derive the log-scale inputs for meta-analysis.
df_long <- stratified_results |>
  as.data.frame() |>
  pivot_longer(
    cols = c(`no weight`, `weight lag=0`, `weight lag=1`),
    names_to = "model",
    values_to = "res"
  ) |>
  mutate(parsed = lapply(res, parse_ci)) |>
  tidyr::unnest_wider(parsed) |> # <-- force tidyr version
  mutate(
    # SE recovered from the 95% CI width; yi is the log-estimate
    # expected by metafor::rma.uni() downstream.
    se = (ci_hi - ci_lo) / (2 * 1.96),
    yi = log(est)
  )
# Run Cochran's Q (heterogeneity) separately for each weighting model.
# NOTE: grouping is by `model`; within each model, heterogeneity is
# assessed ACROSS the plan_type_mod strata via a fixed-effect
# meta-analysis of the log-estimates.
cochran_by_weights <- df_long %>%
  dplyr::group_by(model) %>%
  dplyr::group_map(~{
    # Fixed-effect pooling of the stratum log-estimates for this model
    r <- rma.uni(yi = .x$yi, sei = .x$se, method = "FE")
    tibble::tibble(
      model = .y$model,
      Q = r$QE,        # Cochran's Q statistic
      df = r$k - 1,    # k strata minus 1 degrees of freedom
      pval = r$QEp,
      # rma.uni() returns a 1x1 matrix in $b; drop it to a plain
      # numeric so the column is double, not a matrix column.
      pooled_est = as.numeric(exp(r$b)),
      pooled_ci = paste0(
        round(exp(r$ci.lb), 2), " – ", round(exp(r$ci.ub), 2)
      )
    )
  }) %>%
  dplyr::bind_rows()
cochran_by_weights |>
knitr::kable("markdown", caption= "Heterogeneity by plan type")

| model | Q | df | pval | pooled_est | pooled_ci |
|---|---|---|---|---|---|
| no weight | 9.471652 | 4 | 0.0503330 | 1.044810 | 1.03 – 1.06 |
| weight lag=0 | 7.843235 | 4 | 0.0974924 | 1.053093 | 1.04 – 1.07 |
| weight lag=1 | 12.103026 | 4 | 0.0166014 | 1.047039 | 1.03 – 1.06 |
Session info
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))
Code
message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))
Code
message(paste0("Editor context: ", path))
Error: objeto ‘path’ no encontrado
Code
cat("quarto version: "); quarto::quarto_version()
quarto version:
[1] '1.7.29'
Code
sesion_info <- devtools::session_info()
Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/RtmpI5cztq/file820442a5662 -V’ tiene el estatus 1
Code
# Render the loaded R packages (name, version, source) as an interactive
# DT widget. The initComplete JS callback applies compact CSS (small
# font, no wrapping) so the table fits the rendered report.
# NOTE(review): trailing "Code" on the last line is rendered-output
# residue fused onto the source, not part of the expression.
dplyr::select(
tibble::as_tibble(sesion_info$packages),
c(package, loadedversion, source)
)|>
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('R packages')),
options=list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}")))Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#|class-output: center-table
# Render the Python packages of the reticulate environment as an
# interactive DT widget, styled identically to the R-packages table.
# NOTE(review): the Warning text fused onto the last line is console
# output (pip freeze exited with status 1), not part of the expression.
reticulate::py_list_packages()|>
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('Python packages')),
options=list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}"))) Warning in system2(python, args, stdout = TRUE): el comando ejecutado ‘“G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe” -m pip freeze’ tiene el estatus 1
Save
Code
# Recompute the project root by stripping the "cons" suffix from getwd().
# NOTE(review): gsub("cons","",...) removes EVERY occurrence of "cons"
# anywhere in the path, and the result keeps a trailing slash, which
# produces the double/triple slashes visible in the recorded output —
# TODO: consider dirname()/file.path() instead.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
# Pick the Drive mount point by the drive letter of wdpath (G: vs E:).
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
# The next three bare expressions only print the candidate paths for the log.
paste0(getwd(),"/cons")
file.path(paste0(wdpath,"data/20241015_out"))
file.path(paste0(envpath,"data/20241015_out"))
# Save
# Date-stamped .Rdata snapshot of the full workspace.
rdata_path <- file.path(wdpath, "data/20241015_out/psu", paste0("psu_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))
save.image(rdata_path)
cat("Saved in:",
rdata_path)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
time_after_dedup2<-Sys.time()
paste0("Time in markdown: ");time_after_dedup2-time_before_dedup2
[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/psu/psu_ndp_2025_09_25.Rdata[1] "Time in markdown: "
Time difference of 5.817105 mins